You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/02/04 13:42:03 UTC
svn commit: r906468 - in /lucene/java/trunk: ./ contrib/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/ contrib/analyzers/common/src/resources/org/apache/lucene/analysis...

Author: rmuir
Date: Thu Feb  4 12:41:56 2010
New Revision: 906468

URL: http://svn.apache.org/viewvc?rev=906468&view=rev
Log:
LUCENE-2234: Hindi Analyzer

Added:
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilter.java   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilter.java   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/package.html   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilter.java   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/package.html   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/
    lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiNormalizer.java   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java   (with props)
Modified:
    lucene/java/trunk/NOTICE.txt
    lucene/java/trunk/contrib/CHANGES.txt

Modified: lucene/java/trunk/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/NOTICE.txt?rev=906468&r1=906467&r2=906468&view=diff
==============================================================================
--- lucene/java/trunk/NOTICE.txt (original)
+++ lucene/java/trunk/NOTICE.txt Thu Feb  4 12:41:56 2010
@@ -28,6 +28,11 @@
 contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
 See http://members.unine.ch/jacques.savoy/clef/index.html.
 
+The Hindi analyzer (contrib/analyzers) comes with a default
+stopword list that is BSD-licensed created by Jacques Savoy.  The file resides in
+contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt.
+See http://members.unine.ch/jacques.savoy/clef/index.html.
+
 Includes lib/servlet-api-2.4.jar from  Apache Tomcat
 
 The SmartChineseAnalyzer source code (under contrib/analyzers) was

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=906468&r1=906467&r2=906468&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Thu Feb  4 12:41:56 2010
@@ -103,6 +103,8 @@
    character is now configurable. Its also up to 20% faster. 
    (Steven Rowe via Robert Muir)
 
+ * LUCENE-2234: Add a Hindi analyzer.  (Robert Muir)
+
 Build
 
  * LUCENE-2124: Moved the JDK-based collation support from contrib/collation 

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java Thu Feb  4 12:41:56 2010
@@ -0,0 +1,132 @@
+package org.apache.lucene.analysis.hi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.in.IndicNormalizationFilter;
+import org.apache.lucene.analysis.in.IndicTokenizer;
+import org.apache.lucene.util.Version;
+
+/**
+ * Analyzer for Hindi.
+ */
+public final class HindiAnalyzer extends StopwordAnalyzerBase {
+  private final Set<?> stemExclusionSet;
+  
+  /**
+   * File containing default Hindi stopwords.
+   * 
+   * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
+   * The stopword list is BSD-Licensed.
+   */
+  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+  private static final String STOPWORDS_COMMENT = "#";
+  
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static Set<?> getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+  
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class 
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set<?> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadStopwordSet(false, HindiAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR); keep the IOException as the cause for diagnosis
+        throw new RuntimeException("Unable to load default stopword set", ex);
+      }
+    }
+  }
+  
+  /**
+   * Builds an analyzer with the given stop words and stem exclusion set.
+   * 
+   * @param version lucene compatibility version
+   * @param stopwords a stopword set
+   * @param stemExclusionSet a stemming exclusion set; it is copied, so later changes are not seen
+   */
+  public HindiAnalyzer(Version version, Set<?> stopwords, Set<?> stemExclusionSet) {
+    super(version, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(
+        CharArraySet.copy(matchVersion, stemExclusionSet));
+  }
+  
+  /**
+   * Builds an analyzer with the given stop words and an empty stem exclusion set.
+   * 
+   * @param version lucene compatibility version
+   * @param stopwords a stopword set
+   */
+  public HindiAnalyzer(Version version, Set<?> stopwords) {
+    this(version, stopwords, CharArraySet.EMPTY_SET);
+  }
+  
+  /**
+   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+   * @param version lucene compatibility version
+   */
+  public HindiAnalyzer(Version version) {
+    this(version, DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+
+  /**
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
+   * {@link Reader}.
+   * 
+   * @return {@link TokenStreamComponents} built from a {@link IndicTokenizer}
+   *         filtered with {@link LowerCaseFilter}, 
+   *         {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
+   *         {@link IndicNormalizationFilter}, {@link HindiNormalizationFilter},
+   *         {@link StopFilter} with the configured stop words, and
+   *         {@link HindiStemFilter}, applied in that order
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new IndicTokenizer(matchVersion, reader);
+    TokenStream result = new LowerCaseFilter(matchVersion, source);
+    if (!stemExclusionSet.isEmpty()) // mark keywords before the keyword-aware filters below
+      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+    result = new IndicNormalizationFilter(result);
+    result = new HindiNormalizationFilter(result);
+    result = new StopFilter(matchVersion, result, stopwords);
+    result = new HindiStemFilter(result);
+    return new TokenStreamComponents(source, result);
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilter.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilter.java Thu Feb  4 12:41:56 2010
@@ -0,0 +1,59 @@
+package org.apache.lucene.analysis.hi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link HindiNormalizer} to normalize the
+ * orthography.
+ * <p>
+ * In some cases the normalization may cause unrelated terms to conflate, so
+ * to prevent terms from being normalized use an instance of
+ * {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ * @see HindiNormalizer
+ */
+public final class HindiNormalizationFilter extends TokenFilter {
+
+  private final HindiNormalizer normalizer = new HindiNormalizer();
+  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+  /** Creates a filter that normalizes the terms produced by {@code input}. */
+  public HindiNormalizationFilter(TokenStream input) {
+    super(input);
+  }
+  /** Advances the wrapped stream; the term is normalized unless it is flagged as a keyword. */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken())
+      return false;
+    if (keywordAtt.isKeyword())
+      return true; // keyword tokens pass through unnormalized
+    final int newLength = normalizer.normalize(termAtt.termBuffer(), termAtt.termLength());
+    termAtt.setTermLength(newLength);
+    return true;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java Thu Feb  4 12:41:56 2010
@@ -0,0 +1,194 @@
+package org.apache.lucene.analysis.hi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Normalizer for Hindi.
+ * <p>
+ * Normalizes text to remove some differences in spelling variations.
+ * <p>
+ * Implements the Hindi-language specific algorithm specified in:
+ * <i>Word normalization in Indian languages</i>
+ * Prasad Pingali and Vasudeva Varma.
+ * http://web2py.iiit.ac.in/publications/default/download/inproceedings.pdf.3fe5b38c-02ee-41ce-9a8f-3e745670be32.pdf
+ * <p>
+ * with the following additions from <i>Hindi CLIR in Thirty Days</i>
+ * Leah S. Larkey, Margaret E. Connell, and Nasreen AbdulJaleel.
+ * http://maroo.cs.umass.edu/pub/web/getpdf.php?id=454:
+ * <ul>
+ *  <li>Internal Zero-width joiner and Zero-width non-joiners are removed
+ *  <li>In addition to chandrabindu, NA+halant is normalized to anusvara
+ * </ul>
+ * 
+ */
+public class HindiNormalizer {
+  /**
+   * Normalize an input buffer of Hindi text in place, in a single left-to-right pass
+   * 
+   * @param s input buffer; characters are replaced or removed directly in this array
+   * @param len length of input buffer
+   * @return length of input buffer after normalization
+   */
+  public int normalize(char s[], int len) {
+
+    for (int i = 0; i < len; i++) {
+      switch (s[i]) {
+        // dead n (NA immediately followed by virama) -> bindu, and the virama is dropped
+      case '\u0928':
+        if (i + 1 < len && s[i + 1] == '\u094D') {
+          s[i] = '\u0902';
+          len = delete(s, i + 1, len);
+        }
+        break;
+      // candrabindu -> bindu
+      case '\u0901':
+        s[i] = '\u0902';
+        break;
+      // nukta deletions
+      case '\u093C':
+        len = delete(s, i, len);
+        i--; // step back so the character shifted into position i is examined next
+        break;      
+      case '\u0929':
+        s[i] = '\u0928';
+        break;
+      case '\u0931':
+        s[i] = '\u0930';
+        break;
+      case '\u0934':
+        s[i] = '\u0933';
+        break;
+      case '\u0958':
+        s[i] = '\u0915';
+        break;
+      case '\u0959':
+        s[i] = '\u0916';
+        break;
+      case '\u095A':
+        s[i] = '\u0917';
+        break;
+      case '\u095B':
+        s[i] = '\u091C';
+        break;
+      case '\u095C':
+        s[i] = '\u0921';
+        break;
+      case '\u095D':
+        s[i] = '\u0922';
+        break;
+      case '\u095E':
+        s[i] = '\u092B';
+        break;
+      case '\u095F':
+        s[i] = '\u092F';
+        break;
+        // zwj/zwnj -> delete
+      case '\u200D':
+      case '\u200C':
+        len = delete(s, i, len);
+        i--; // step back so the character shifted into position i is examined next
+        break;
+        // virama -> delete
+      case '\u094D':
+        len = delete(s, i, len);
+        i--; // step back so the character shifted into position i is examined next
+        break;
+        // chandra/short -> replace
+      case '\u0945':
+      case '\u0946':
+        s[i] = '\u0947';
+        break;
+      case '\u0949':
+      case '\u094A':
+        s[i] = '\u094B';
+        break;
+      case '\u090D':
+      case '\u090E':
+        s[i] = '\u090F';
+        break;
+      case '\u0911':
+      case '\u0912':
+        s[i] = '\u0913';
+        break;
+      case '\u0972':
+        s[i] = '\u0905';
+        break;
+        // long -> short ind. vowels
+      case '\u0906':
+        s[i] = '\u0905';
+        break;
+      case '\u0908':
+        s[i] = '\u0907';
+        break;
+      case '\u090A':
+        s[i] = '\u0909';
+        break;
+      case '\u0960':
+        s[i] = '\u090B';
+        break;
+      case '\u0961':
+        s[i] = '\u090C';
+        break;
+      case '\u0910':
+        s[i] = '\u090F';
+        break;
+      case '\u0914':
+        s[i] = '\u0913';
+        break;
+        // long -> short dep. vowels
+      case '\u0940':
+        s[i] = '\u093F';
+        break;
+      case '\u0942':
+        s[i] = '\u0941';
+        break;
+      case '\u0944':
+        s[i] = '\u0943';
+        break;
+      case '\u0963':
+        s[i] = '\u0962';
+        break;
+      case '\u0948':
+        s[i] = '\u0947';
+        break;
+      case '\u094C':
+        s[i] = '\u094B';
+        break;
+      default: // every other character is left unchanged
+        break;
+      }
+    }
+
+    return len;
+  }
+
+  /**
+   * Delete a character in-place by shifting the characters after it one slot to the left
+   * 
+   * @param s Input Buffer
+   * @param pos Position of character to delete
+   * @param len length of input buffer
+   * @return length of input buffer after deletion; always len - 1, even if pos is not below len
+   */
+  protected int delete(char s[], int pos, int len) {
+    if (pos < len)
+      System.arraycopy(s, pos + 1, s, pos, len - pos - 1); // overwrite s[pos] with the tail
+    
+    return len - 1;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilter.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilter.java Thu Feb  4 12:41:56 2010
@@ -0,0 +1,49 @@
+package org.apache.lucene.analysis.hi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link HindiStemmer} to stem Hindi words.
+ */
+public final class HindiStemFilter extends TokenFilter {
+  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+  private final HindiStemmer stemmer = new HindiStemmer();
+  /** Creates a filter that stems the terms produced by {@code input}. */
+  public HindiStemFilter(TokenStream input) { // public, like the sibling filters: the class is final
+    super(input);
+  }
+  /** Advances the wrapped stream; the term is stemmed unless it is flagged as a keyword. */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      if (!keywordAtt.isKeyword()) // keyword tokens pass through unstemmed
+        termAtt.setTermLength(stemmer.stem(termAtt.termBuffer(), termAtt.termLength()));
+      return true;
+    } else {
+      return false;
+    }
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java Thu Feb  4 12:41:56 2010
@@ -0,0 +1,130 @@
+package org.apache.lucene.analysis.hi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Light Stemmer for Hindi.
+ * <p>
+ * Implements the algorithm specified in:
+ * <i>A Lightweight Stemmer for Hindi</i>
+ * Ananthakrishnan Ramanathan and Durgesh D Rao.
+ * http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
+ * </p>
+ */
+public class HindiStemmer {
+  public int stem(char buffer[], int len) {
+    // 5
+    if ((len > 6) && (endsWith(buffer, len, "à¤¾à¤à¤à¤à¥")
+        || endsWith(buffer, len, "à¤¾à¤à¤à¤à¥")
+        || endsWith(buffer, len, "à¤¾à¤à¤à¤à¥")
+        || endsWith(buffer, len, "à¤¾à¤à¤à¤à¤¾")
+        || endsWith(buffer, len, "à¤¾à¤à¤¯à¤¾à¤")
+        || endsWith(buffer, len, "à¤¾à¤à¤¯à¥à¤")
+        || endsWith(buffer, len, "à¤¾à¤à¤¯à¤¾à¤")
+      ))
+      return len - 5;
+    
+    // 4
+    if ((len > 5) && (endsWith(buffer, len, "à¤¾à¤à¤à¥")
+        || endsWith(buffer, len, "à¤¾à¤à¤à¤¾")
+        || endsWith(buffer, len, "à¤¾à¤à¤à¥")
+        || endsWith(buffer, len, "à¤¾à¤à¤à¥")
+        || endsWith(buffer, len, "à¤à¤à¤à¥")
+        || endsWith(buffer, len, "à¥à¤à¤à¥")
+        || endsWith(buffer, len, "à¤à¤à¤à¥")
+        || endsWith(buffer, len, "à¥à¤à¤à¥")
+        || endsWith(buffer, len, "à¥à¤à¤à¥")
+        || endsWith(buffer, len, "à¥à¤à¤à¤¾")
+        || endsWith(buffer, len, "à¤¾à¤¤à¥à¤")
+        || endsWith(buffer, len, "à¤¨à¤¾à¤à¤")
+        || endsWith(buffer, len, "à¤¨à¤¾à¤à¤")
+        || endsWith(buffer, len, "à¤¤à¤¾à¤à¤")
+        || endsWith(buffer, len, "à¤¤à¤¾à¤à¤")
+        || endsWith(buffer, len, "à¤¿à¤¯à¤¾à¤")
+        || endsWith(buffer, len, "à¤¿à¤¯à¥à¤")
+        || endsWith(buffer, len, "à¤¿à¤¯à¤¾à¤")
+        ))
+      return len - 4;
+    
+    // 3
+    if ((len > 4) && (endsWith(buffer, len, "à¤¾à¤à¤°")
+        || endsWith(buffer, len, "à¤¾à¤à¤")
+        || endsWith(buffer, len, "à¤¾à¤à¤")
+        || endsWith(buffer, len, "à¤¾à¤¯à¤¾")
+        || endsWith(buffer, len, "à¥à¤à¥")
+        || endsWith(buffer, len, "à¥à¤à¤¾")
+        || endsWith(buffer, len, "à¥à¤à¥")
+        || endsWith(buffer, len, "à¥à¤à¥")
+        || endsWith(buffer, len, "à¤¾à¤¨à¥")
+        || endsWith(buffer, len, "à¤¾à¤¨à¤¾")
+        || endsWith(buffer, len, "à¤¾à¤¤à¥")
+        || endsWith(buffer, len, "à¤¾à¤¤à¥")
+        || endsWith(buffer, len, "à¤¾à¤¤à¤¾")
+        || endsWith(buffer, len, "à¤¤à¥à¤")
+        || endsWith(buffer, len, "à¤¾à¤à¤")
+        || endsWith(buffer, len, "à¤¾à¤à¤")
+        || endsWith(buffer, len, "à¥à¤à¤")
+        || endsWith(buffer, len, "à¥à¤à¤")
+        || endsWith(buffer, len, "à¥à¤à¤")
+        ))
+      return len - 3;
+    
+    // 2
+    if ((len > 3) && (endsWith(buffer, len, "à¤à¤°")
+        || endsWith(buffer, len, "à¤¾à¤")
+        || endsWith(buffer, len, "à¤¿à¤")
+        || endsWith(buffer, len, "à¤¾à¤")
+        || endsWith(buffer, len, "à¤¾à¤")
+        || endsWith(buffer, len, "à¤¨à¥")
+        || endsWith(buffer, len, "à¤¨à¥")
+        || endsWith(buffer, len, "à¤¨à¤¾")
+        || endsWith(buffer, len, "à¤¤à¥")
+        || endsWith(buffer, len, "à¥à¤")
+        || endsWith(buffer, len, "à¤¤à¥")
+        || endsWith(buffer, len, "à¤¤à¤¾")
+        || endsWith(buffer, len, "à¤¾à¤")
+        || endsWith(buffer, len, "à¤¾à¤")
+        || endsWith(buffer, len, "à¥à¤")
+        || endsWith(buffer, len, "à¥à¤")
+        ))
+      return len - 2;
+    
+    // 1
+    if ((len > 2) && (endsWith(buffer, len, "à¥")
+        || endsWith(buffer, len, "à¥")
+        || endsWith(buffer, len, "à¥")
+        || endsWith(buffer, len, "à¥")
+        || endsWith(buffer, len, "à¥")
+        || endsWith(buffer, len, "à¤¿")
+        || endsWith(buffer, len, "à¤¾")
+       ))
+      return len - 1;
+    return len;
+  }
+  
+  private boolean endsWith(final char s[], final int len, final String suffix) {
+    final int suffixLen = suffix.length();
+    if (suffixLen > len)
+      return false;
+    for (int i = suffixLen - 1; i >= 0; i--)
+      if (s[len -(suffixLen - i)] != suffix.charAt(i))
+        return false;
+    
+    return true;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/package.html?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/package.html Thu Feb  4 12:41:56 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Hindi.
+</body>
+</html>

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilter.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilter.java Thu Feb  4 12:41:56 2010
@@ -0,0 +1,47 @@
+package org.apache.lucene.analysis.in;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link IndicNormalizer} to normalize text
+ * in Indian Languages.
+ */
+public final class IndicNormalizationFilter extends TokenFilter {
+  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+  private final IndicNormalizer normalizer = new IndicNormalizer();
+  /** Creates a filter that normalizes the terms produced by {@code input}. */
+  public IndicNormalizationFilter(TokenStream input) {
+    super(input);
+  }
+  /** Advances the wrapped stream and runs {@link IndicNormalizer} over the current term. */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken())
+      return false;
+    // hand the term buffer to the normalizer and adopt the length it returns
+    final int normalizedLength = normalizer.normalize(termAtt.termBuffer(), termAtt.termLength());
+    termAtt.setTermLength(normalizedLength);
+    return true;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java Thu Feb  4 12:41:56 2010
@@ -0,0 +1,303 @@
+package org.apache.lucene.analysis.in;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.BitSet;
+import java.util.IdentityHashMap;
+import static java.lang.Character.UnicodeBlock.*;
+
+/**
+ * Normalizes the Unicode representation of text in Indian languages.
+ * <p>
+ * Follows guidelines from Unicode 5.2, chapter 6, South Asian Scripts I
+ * and graphical decompositions from http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
+ * </p>
+ */
+public class IndicNormalizer {
+  
+  /** Per-script constants used while normalizing. */
+  private static class ScriptData {
+    /** bit identifying this script in the flags column of the decompositions table */
+    final int flag;
+    /** value subtracted from a char to obtain its offset within the script's block */
+    final int base;
+    /** offsets that occur as the first char of a decomposition for this script; assigned by a static initializer */
+    BitSet decompMask;
+    
+    ScriptData(int scriptFlag, int blockBase) {
+      flag = scriptFlag;
+      base = blockBase;
+    }
+  }
+  
+  /** The supported scripts, keyed by their Unicode block. */
+  private static final IdentityHashMap<Character.UnicodeBlock,ScriptData> scripts = 
+    new IdentityHashMap<Character.UnicodeBlock,ScriptData>(9);
+  
+  /**
+   * Returns the flag bit registered for the given block.
+   * The block must be one of the keys of {@link #scripts}.
+   */
+  private static int flag(Character.UnicodeBlock ub) {
+    final ScriptData data = scripts.get(ub);
+    return data.flag;
+  }
+  
+  static {
+    // one distinct bit per script, paired with the first code point of its block
+    scripts.put(DEVANAGARI, new ScriptData(1 << 0, 0x0900));
+    scripts.put(BENGALI,    new ScriptData(1 << 1, 0x0980));
+    scripts.put(GURMUKHI,   new ScriptData(1 << 2, 0x0A00));
+    scripts.put(GUJARATI,   new ScriptData(1 << 3, 0x0A80));
+    scripts.put(ORIYA,      new ScriptData(1 << 4, 0x0B00));
+    scripts.put(TAMIL,      new ScriptData(1 << 5, 0x0B80));
+    scripts.put(TELUGU,     new ScriptData(1 << 6, 0x0C00));
+    scripts.put(KANNADA,    new ScriptData(1 << 7, 0x0C80));
+    scripts.put(MALAYALAM,  new ScriptData(1 << 8, 0x0D00));
+  }
+
+  /**
+   * Decompositions according to Unicode 5.2, 
+   * and http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
+   * 
+   * Most of these are not handled by unicode normalization anyway.
+   * 
+   * The numbers here represent offsets into the respective codepages,
+   * with -1 representing null and 0xFF representing zero-width joiner.
+   * 
+   * the columns are: ch1, ch2, ch3, res, flags
+   * ch1, ch2, and ch3 are the decomposition
+   * res is the composition, and flags are the scripts to which it applies.
+   */
+  private static final int decompositions[][] = {
+      /* devanagari, gujarati vowel candra O */
+      { 0x05, 0x3E, 0x45, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
+      /* devanagari short O */
+      { 0x05, 0x3E, 0x46, 0x12, flag(DEVANAGARI) }, 
+      /* devanagari, gujarati letter O */
+      { 0x05, 0x3E, 0x47, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
+      /* devanagari, gujarati letter AU */
+      { 0x05, 0x3E, 0x48, 0x14, flag(DEVANAGARI) | flag(GUJARATI) }, 
+      /* devanagari, bengali, gurmukhi, gujarati, oriya AA */
+      { 0x05, 0x3E,   -1, 0x06, flag(DEVANAGARI) | flag(BENGALI) | flag(GURMUKHI) | flag(GUJARATI) | flag(ORIYA) }, 
+      /* devanagari letter candra A */
+      { 0x05, 0x45,   -1, 0x72, flag(DEVANAGARI) },
+      /* gujarati vowel candra E */
+      { 0x05, 0x45,   -1, 0x0D, flag(GUJARATI) },
+      /* devanagari letter short A */
+      { 0x05, 0x46,   -1, 0x04, flag(DEVANAGARI) },
+      /* gujarati letter E */
+      { 0x05, 0x47,   -1, 0x0F, flag(GUJARATI) }, 
+      /* gurmukhi, gujarati letter AI */
+      { 0x05, 0x48,   -1, 0x10, flag(GURMUKHI) | flag(GUJARATI) }, 
+      /* devanagari, gujarati vowel candra O */
+      { 0x05, 0x49,   -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) }, 
+      /* devanagari short O */
+      { 0x05, 0x4A,   -1, 0x12, flag(DEVANAGARI) }, 
+      /* devanagari, gujarati letter O */
+      { 0x05, 0x4B,   -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) }, 
+      /* devanagari, gurmukhi, gujarati letter AU */
+      { 0x05, 0x4C,   -1, 0x14, flag(DEVANAGARI) | flag(GURMUKHI) | flag(GUJARATI) }, 
+      /* devanagari, gujarati vowel candra O */
+      { 0x06, 0x45,   -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },  
+      /* devanagari short O */
+      { 0x06, 0x46,   -1, 0x12, flag(DEVANAGARI) },
+      /* devanagari, gujarati letter O */
+      { 0x06, 0x47,   -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
+      /* devanagari, gujarati letter AU */
+      { 0x06, 0x48,   -1, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
+      /* malayalam letter II */
+      { 0x07, 0x57,   -1, 0x08, flag(MALAYALAM) },
+      /* devanagari letter UU */
+      { 0x09, 0x41,   -1, 0x0A, flag(DEVANAGARI) },
+      /* tamil, malayalam letter UU (some styles) */
+      { 0x09, 0x57,   -1, 0x0A, flag(TAMIL) | flag(MALAYALAM) },
+      /* malayalam letter AI */
+      { 0x0E, 0x46,   -1, 0x10, flag(MALAYALAM) },
+      /* devanagari candra E */
+      { 0x0F, 0x45,   -1, 0x0D, flag(DEVANAGARI) }, 
+      /* devanagari short E */
+      { 0x0F, 0x46,   -1, 0x0E, flag(DEVANAGARI) },
+      /* devanagari AI */
+      { 0x0F, 0x47,   -1, 0x10, flag(DEVANAGARI) },
+      /* oriya AI */
+      { 0x0F, 0x57,   -1, 0x10, flag(ORIYA) },
+      /* malayalam letter OO */
+      { 0x12, 0x3E,   -1, 0x13, flag(MALAYALAM) }, 
+      /* telugu, kannada letter AU */
+      { 0x12, 0x4C,   -1, 0x14, flag(TELUGU) | flag(KANNADA) }, 
+      /* telugu letter OO */
+      { 0x12, 0x55,   -1, 0x13, flag(TELUGU) },
+      /* tamil, malayalam letter AU */
+      { 0x12, 0x57,   -1, 0x14, flag(TAMIL) | flag(MALAYALAM) },
+      /* oriya letter AU */
+      { 0x13, 0x57,   -1, 0x14, flag(ORIYA) },
+      /* devanagari qa */
+      { 0x15, 0x3C,   -1, 0x58, flag(DEVANAGARI) },
+      /* devanagari, gurmukhi khha */
+      { 0x16, 0x3C,   -1, 0x59, flag(DEVANAGARI) | flag(GURMUKHI) },
+      /* devanagari, gurmukhi ghha */
+      { 0x17, 0x3C,   -1, 0x5A, flag(DEVANAGARI) | flag(GURMUKHI) },
+      /* devanagari, gurmukhi za */
+      { 0x1C, 0x3C,   -1, 0x5B, flag(DEVANAGARI) | flag(GURMUKHI) },
+      /* devanagari dddha, bengali, oriya rra */
+      { 0x21, 0x3C,   -1, 0x5C, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
+      /* devanagari, bengali, oriya rha */
+      { 0x22, 0x3C,   -1, 0x5D, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
+      /* malayalam chillu nn */
+      { 0x23, 0x4D, 0xFF, 0x7A, flag(MALAYALAM) },
+      /* bengali khanda ta */
+      { 0x24, 0x4D, 0xFF, 0x4E, flag(BENGALI) },
+      /* devanagari nnna */
+      { 0x28, 0x3C,   -1, 0x29, flag(DEVANAGARI) },
+      /* malayalam chillu n */
+      { 0x28, 0x4D, 0xFF, 0x7B, flag(MALAYALAM) },
+      /* devanagari, gurmukhi fa */
+      { 0x2B, 0x3C,   -1, 0x5E, flag(DEVANAGARI) | flag(GURMUKHI) },
+      /* devanagari, bengali yya */
+      { 0x2F, 0x3C,   -1, 0x5F, flag(DEVANAGARI) | flag(BENGALI) },
+      /* telugu letter vocalic R */
+      { 0x2C, 0x41, 0x41, 0x0B, flag(TELUGU) },
+      /* devanagari rra */
+      { 0x30, 0x3C,   -1, 0x31, flag(DEVANAGARI) },
+      /* malayalam chillu rr */
+      { 0x30, 0x4D, 0xFF, 0x7C, flag(MALAYALAM) },
+      /* malayalam chillu l */
+      { 0x32, 0x4D, 0xFF, 0x7D, flag(MALAYALAM) },
+      /* devanagari llla */
+      { 0x33, 0x3C,   -1, 0x34, flag(DEVANAGARI) },
+      /* malayalam chillu ll */
+      { 0x33, 0x4D, 0xFF, 0x7E, flag(MALAYALAM) },
+      /* telugu letter MA */ 
+      { 0x35, 0x41,   -1, 0x2E, flag(TELUGU) },
+      /* devanagari, gujarati vowel sign candra O */
+      { 0x3E, 0x45,   -1, 0x49, flag(DEVANAGARI) | flag(GUJARATI) },
+      /* devanagari vowel sign short O */
+      { 0x3E, 0x46,   -1, 0x4A, flag(DEVANAGARI) },
+      /* devanagari, gujarati vowel sign O */
+      { 0x3E, 0x47,   -1, 0x4B, flag(DEVANAGARI) | flag(GUJARATI) },
+      /* devanagari, gujarati vowel sign AU */ 
+      { 0x3E, 0x48,   -1, 0x4C, flag(DEVANAGARI) | flag(GUJARATI) },
+      /* kannada vowel sign II */ 
+      { 0x3F, 0x55,   -1, 0x40, flag(KANNADA) },
+      /* gurmukhi vowel sign UU (when stacking) */
+      { 0x41, 0x41,   -1, 0x42, flag(GURMUKHI) },
+      /* tamil, malayalam vowel sign O */
+      { 0x46, 0x3E,   -1, 0x4A, flag(TAMIL) | flag(MALAYALAM) },
+      /* kannada vowel sign OO */
+      { 0x46, 0x42, 0x55, 0x4B, flag(KANNADA) },
+      /* kannada vowel sign O */
+      { 0x46, 0x42,   -1, 0x4A, flag(KANNADA) },
+      /* malayalam vowel sign AI (if reordered twice) */
+      { 0x46, 0x46,   -1, 0x48, flag(MALAYALAM) },
+      /* telugu, kannada vowel sign EE */
+      { 0x46, 0x55,   -1, 0x47, flag(TELUGU) | flag(KANNADA) },
+      /* telugu, kannada vowel sign AI */
+      { 0x46, 0x56,   -1, 0x48, flag(TELUGU) | flag(KANNADA) },
+      /* tamil, malayalam vowel sign AU */
+      { 0x46, 0x57,   -1, 0x4C, flag(TAMIL) | flag(MALAYALAM) },
+      /* bengali, oriya vowel sign O, tamil, malayalam vowel sign OO */
+      { 0x47, 0x3E,   -1, 0x4B, flag(BENGALI) | flag(ORIYA) | flag(TAMIL) | flag(MALAYALAM) },
+      /* bengali, oriya vowel sign AU */
+      { 0x47, 0x57,   -1, 0x4C, flag(BENGALI) | flag(ORIYA) },
+      /* kannada vowel sign OO */   
+      { 0x4A, 0x55,   -1, 0x4B, flag(KANNADA) },
+      /* gurmukhi letter I */
+      { 0x72, 0x3F,   -1, 0x07, flag(GURMUKHI) },
+      /* gurmukhi letter II */
+      { 0x72, 0x40,   -1, 0x08, flag(GURMUKHI) },
+      /* gurmukhi letter EE */
+      { 0x72, 0x47,   -1, 0x0F, flag(GURMUKHI) },
+      /* gurmukhi letter U */
+      { 0x73, 0x41,   -1, 0x09, flag(GURMUKHI) },
+      /* gurmukhi letter UU */
+      { 0x73, 0x42,   -1, 0x0A, flag(GURMUKHI) },
+      /* gurmukhi letter OO */
+      { 0x73, 0x4B,   -1, 0x13, flag(GURMUKHI) },
+  };
+  
+  static {
+    // for each script, mark every offset that starts at least one of its decompositions
+    for (ScriptData script : scripts.values()) {
+      final BitSet mask = new BitSet(0x7F);
+      for (final int[] entry : decompositions) {
+        final boolean appliesToScript = (entry[4] & script.flag) != 0;
+        if (appliesToScript) {
+          mask.set(entry[0]);
+        }
+      }
+      script.decompMask = mask;
+    }
+  }
+   
+  /**
+   * Normalizes the first <code>len</code> chars of <code>text</code> in place
+   * and returns the new length.
+   * The length will always be less than or equal to the existing length.
+   * 
+   * @param text input text
+   * @param len valid length
+   * @return normalized length
+   */
+  public int normalize(char text[], int len) {
+    for (int pos = 0; pos < len; pos++) {
+      final char current = text[pos];
+      final Character.UnicodeBlock block = Character.UnicodeBlock.of(current);
+      final ScriptData sd = scripts.get(block);
+      if (sd == null) {
+        continue; // not one of the supported scripts
+      }
+      final int offset = current - sd.base;
+      if (!sd.decompMask.get(offset)) {
+        continue; // no decomposition of this script starts with this char
+      }
+      len = compose(offset, block, sd, text, pos, len);
+    }
+    return len;
+  }
+  
+  /**
+   * Tries to replace the sequence starting at <code>pos</code> with its
+   * composed form from the decompositions table.
+   * <p>
+   * The first table row, in table order, that applies to the script and matches
+   * the two or three chars at <code>pos</code> wins: its result is stored at
+   * <code>pos</code> and the other chars of the matched sequence are deleted.
+   * </p>
+   *
+   * @param ch0 offset of the char at <code>pos</code> within its block
+   * @param block0 Unicode block of the char at <code>pos</code>
+   * @param sd script data belonging to <code>block0</code>
+   * @param text buffer, modified in place
+   * @param pos index of the first char of the candidate sequence
+   * @param len valid length of <code>text</code>
+   * @return the new valid length, unchanged when nothing matched
+   */
+  private int compose(int ch0, Character.UnicodeBlock block0, ScriptData sd, 
+      char text[], int pos, int len) {
+    final int next = pos + 1;
+    if (next >= len) { /* need at least 2 chars! */
+      return len;
+    }
+    
+    final char second = text[next];
+    if (Character.UnicodeBlock.of(second) != block0) { /* needs to be the same writing system */
+      return len;
+    }
+    final int ch1 = second - sd.base;
+    
+    // third char: 0xFF for ZWJ, its block offset when in the same script, else -1
+    int ch2 = -1;
+    final int afterNext = pos + 2;
+    if (afterNext < len) {
+      final char third = text[afterNext];
+      if (third == '\u200D') {
+        ch2 = 0xFF;
+      } else if (Character.UnicodeBlock.of(third) == block0) {
+        ch2 = third - sd.base;
+      }
+    }
+
+    for (final int[] entry : decompositions) {
+      if (entry[0] != ch0 || (entry[4] & sd.flag) == 0) {
+        continue;
+      }
+      if (entry[1] != ch1) {
+        continue;
+      }
+      final boolean threeChars = entry[2] >= 0;
+      if (threeChars && entry[2] != ch2) {
+        continue;
+      }
+      text[pos] = (char) (sd.base + entry[3]);
+      len = delete(text, next, len);
+      if (threeChars) {
+        len = delete(text, next, len);
+      }
+      return len;
+    }
+    
+    return len;
+  }
+  
+  /**
+   * Delete a character in-place
+   */
+  private int delete(char s[], int pos, int len) {
+    if (pos < len) 
+      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
+    
+    return len - 1;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java Thu Feb  4 12:41:56 2010
@@ -0,0 +1,50 @@
+package org.apache.lucene.analysis.in;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.CharTokenizer;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
+
+/**
+ * Simple Tokenizer for text in Indian Languages.
+ */
+public final class IndicTokenizer extends CharTokenizer {
+ 
+  public IndicTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
+    super(matchVersion, factory, input);
+  }
+
+  public IndicTokenizer(Version matchVersion, AttributeSource source, Reader input) {
+    super(matchVersion, source, input);
+  }
+
+  public IndicTokenizer(Version matchVersion, Reader input) {
+    super(matchVersion, input);
+  }
+
+  @Override
+  protected boolean isTokenChar(int c) {
+    return Character.isLetter(c)
+    || Character.getType(c) == Character.NON_SPACING_MARK
+    || Character.getType(c) == Character.FORMAT
+    || Character.getType(c) == Character.COMBINING_SPACING_MARK;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/package.html?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/package.html Thu Feb  4 12:41:56 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analysis components for Indian languages.
+</body>
+</html>

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt Thu Feb  4 12:41:56 2010
@@ -0,0 +1,231 @@
+# Also see http://www.opensource.org/licenses/bsd-license.html
+# See http://members.unine.ch/jacques.savoy/clef/index.html.
+# This file was created by Jacques Savoy and is distributed under the BSD license.
+à¤à¤à¤¦à¤°
+à¤à¤¤
+à¤à¤ªà¤¨à¤¾
+à¤à¤ªà¤¨à¥
+à¤à¤ªà¤¨à¥
+à¤à¤à¥
+à¤à¤¦à¤¿
+à¤à¤ª
+à¤à¤¤à¥à¤¯à¤¾à¤¦à¤¿
+à¤à¤¨ 
+à¤à¤¨à¤à¤¾
+à¤à¤¨à¥à¤¹à¥à¤
+à¤à¤¨à¥à¤¹à¥à¤
+à¤à¤¨à¥à¤¹à¥à¤
+à¤à¤¸
+à¤à¤¸à¤à¤¾
+à¤à¤¸à¤à¥
+à¤à¤¸à¤à¥
+à¤à¤¸à¤®à¥à¤
+à¤à¤¸à¥
+à¤à¤¸à¥
+à¤à¤¨
+à¤à¤¨à¤à¤¾
+à¤à¤¨à¤à¥
+à¤à¤¨à¤à¥
+à¤à¤¨à¤à¥
+à¤à¤¨à¥à¤¹à¥à¤
+à¤à¤¨à¥à¤¹à¥à¤
+à¤à¤¨à¥à¤¹à¥à¤
+à¤à¤¸
+à¤à¤¸à¤à¥
+à¤à¤¸à¥
+à¤à¤¸à¥
+à¤à¤
+à¤à¤µà¤
+à¤à¤¸
+à¤à¤¸à¥
+à¤à¤°
+à¤à¤
+à¤à¤°
+à¤à¤°à¤¤à¤¾
+à¤à¤°à¤¤à¥
+à¤à¤°à¤¨à¤¾
+à¤à¤°à¤¨à¥
+à¤à¤°à¥à¤
+à¤à¤¹à¤¤à¥
+à¤à¤¹à¤¾
+à¤à¤¾
+à¤à¤¾à¥à¥
+à¤à¤¿
+à¤à¤¿à¤¤à¤¨à¤¾
+à¤à¤¿à¤¨à¥à¤¹à¥à¤
+à¤à¤¿à¤¨à¥à¤¹à¥à¤
+à¤à¤¿à¤¯à¤¾
+à¤à¤¿à¤°
+à¤à¤¿à¤¸
+à¤à¤¿à¤¸à¥
+à¤à¤¿à¤¸à¥
+à¤à¥
+à¤à¥à¤
+à¤à¥à¤²
+à¤à¥
+à¤à¥
+à¤à¥à¤
+à¤à¥à¤¨
+à¤à¥à¤¨à¤¸à¤¾
+à¤à¤¯à¤¾
+à¤à¤°
+à¤à¤¬
+à¤à¤¹à¤¾à¤
+à¤à¤¾
+à¤à¤¿à¤¤à¤¨à¤¾
+à¤à¤¿à¤¨
+à¤à¤¿à¤¨à¥à¤¹à¥à¤
+à¤à¤¿à¤¨à¥à¤¹à¥à¤
+à¤à¤¿à¤¸
+à¤à¤¿à¤¸à¥
+à¤à¥à¤§à¤°
+à¤à¥à¤¸à¤¾
+à¤à¥à¤¸à¥
+à¤à¥
+à¤¤à¤
+à¤¤à¤¬
+à¤¤à¤°à¤¹
+à¤¤à¤¿à¤¨
+à¤¤à¤¿à¤¨à¥à¤¹à¥à¤
+à¤¤à¤¿à¤¨à¥à¤¹à¥à¤
+à¤¤à¤¿à¤¸
+à¤¤à¤¿à¤¸à¥
+à¤¤à¥
+à¤¥à¤¾
+à¤¥à¥
+à¤¥à¥
+à¤¦à¤¬à¤¾à¤°à¤¾
+à¤¦à¤¿à¤¯à¤¾
+à¤¦à¥à¤¸à¤°à¤¾
+à¤¦à¥à¤¸à¤°à¥
+à¤¦à¥
+à¤¦à¥à¤µà¤¾à¤°à¤¾
+à¤¨
+à¤¨à¤¹à¥à¤
+à¤¨à¤¾
+à¤¨à¤¿à¤¹à¤¾à¤¯à¤¤
+à¤¨à¥à¤à¥
+à¤¨à¥
+à¤ªà¤°
+à¤ªà¤°  
+à¤ªà¤¹à¤²à¥
+à¤ªà¥à¤°à¤¾
+à¤ªà¥
+à¤«à¤¿à¤°
+à¤¬à¤¨à¥
+à¤¬à¤¹à¥
+à¤¬à¤¹à¥à¤¤
+à¤¬à¤¾à¤¦
+à¤¬à¤¾à¤²à¤¾
+à¤¬à¤¿à¤²à¤à¥à¤²
+à¤à¥
+à¤à¥à¤¤à¤°
+à¤®à¤à¤°
+à¤®à¤¾à¤¨à¥
+à¤®à¥
+à¤®à¥à¤
+à¤¯à¤¦à¤¿
+à¤¯à¤¹
+à¤¯à¤¹à¤¾à¤
+à¤¯à¤¹à¥
+à¤¯à¤¾
+à¤¯à¤¿à¤¹ 
+à¤¯à¥
+à¤°à¤à¥à¤
+à¤°à¤¹à¤¾
+à¤°à¤¹à¥
+à¤±à¥à¤µà¤¾à¤¸à¤¾
+à¤²à¤¿à¤
+à¤²à¤¿à¤¯à¥
+à¤²à¥à¤à¤¿à¤¨
+à¤µ
+à¤µà¤°à¥à¤
+à¤µà¤¹
+à¤µà¤¹ 
+à¤µà¤¹à¤¾à¤
+à¤µà¤¹à¥à¤
+à¤µà¤¾à¤²à¥
+à¤µà¥à¤¹ 
+à¤µà¥
+à¤µà¥à¥à¤°à¤¹
+à¤¸à¤à¤
+à¤¸à¤à¤¤à¤¾
+à¤¸à¤à¤¤à¥
+à¤¸à¤¬à¤¸à¥
+à¤¸à¤à¥
+à¤¸à¤¾à¤¥
+à¤¸à¤¾à¤¬à¥à¤¤
+à¤¸à¤¾à¤
+à¤¸à¤¾à¤°à¤¾
+à¤¸à¥
+à¤¸à¥
+à¤¹à¥
+à¤¹à¥à¤
+à¤¹à¥à¤
+à¤¹à¥à¤
+à¤¹à¥
+à¤¹à¥à¤
+à¤¹à¥
+à¤¹à¥à¤¤à¤¾
+à¤¹à¥à¤¤à¥
+à¤¹à¥à¤¤à¥
+à¤¹à¥à¤¨à¤¾
+à¤¹à¥à¤¨à¥
+# additional normalized forms of the above
+à¤à¤ªà¤¨à¤¿
+à¤à¥à¤¸à¥
+à¤¹à¥à¤¤à¤¿
+à¤¸à¤à¤¿
+à¤¤à¤¿à¤à¤¹à¥à¤
+à¤à¤à¤¹à¥à¤
+à¤¦à¤µà¤¾à¤°à¤¾
+à¤à¤¸à¤¿
+à¤à¤¿à¤à¤¹à¥à¤
+à¤¥à¤¿
+à¤à¤à¤¹à¥à¤
+à¤à¤°
+à¤à¤¿à¤à¤¹à¥à¤
+à¤µà¤¹à¤¿à¤
+à¤à¤à¤¿
+à¤¬à¤¨à¤¿
+à¤¹à¤¿
+à¤à¤à¤¹à¤¿à¤
+à¤à¤à¤¹à¥à¤
+à¤¹à¥à¤
+à¤µà¤à¥à¤°à¤¹
+à¤à¤¸à¥
+à¤°à¤µà¤¾à¤¸à¤¾
+à¤à¥à¤¨
+à¤¨à¤¿à¤à¥
+à¤à¤¾à¤«à¤¿
+à¤à¤¸à¤¿
+à¤ªà¥à¤°à¤¾
+à¤à¤¿à¤¤à¤°
+à¤¹à¥
+à¤¬à¤¹à¤¿
+à¤µà¤¹à¤¾à¤
+à¤à¥à¤
+à¤¯à¤¹à¤¾à¤
+à¤à¤¿à¤à¤¹à¥à¤
+à¤¤à¤¿à¤à¤¹à¥à¤
+à¤à¤¿à¤¸à¤¿
+à¤à¤
+à¤¯à¤¹à¤¿
+à¤à¤à¤¹à¤¿à¤
+à¤à¤¿à¤§à¤°
+à¤à¤à¤¹à¥à¤
+à¤à¤¦à¤¿
+à¤à¤¤à¤¯à¤¾à¤¦à¤¿
+à¤¹à¥à¤
+à¤à¥à¤¨à¤¸à¤¾
+à¤à¤¸à¤à¤¿
+à¤¦à¥à¤¸à¤°à¥
+à¤à¤¹à¤¾à¤
+à¤à¤ª
+à¤à¤¿à¤à¤¹à¥à¤
+à¤à¤¨à¤à¤¿
+à¤à¤¿
+à¤µà¤°à¤
+à¤¹à¥à¤
+à¤à¥à¤¸à¤¾
+à¤¨à¤¹à¤¿à¤

Propchange: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java Thu Feb  4 12:41:56 2010
@@ -0,0 +1,51 @@
+package org.apache.lucene.analysis.hi;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests the HindiAnalyzer.
+ * <p>
+ * The Devanagari test words are written as Unicode escapes so that they do not
+ * depend on the encoding used to store or transmit this file.
+ * </p>
+ */
+public class TestHindiAnalyzer extends BaseTokenStreamTestCase {
+  /** This test fails with NPE when the 
+   * stopwords file is missing in classpath */
+  public void testResourcesAvailable() {
+    new HindiAnalyzer(Version.LUCENE_CURRENT);
+  }
+  
+  /** Both spellings of 'hindi' must be analyzed to the same single term. */
+  public void testBasics() throws Exception {
+    Analyzer a = new HindiAnalyzer(Version.LUCENE_CURRENT);
+    // two ways to write 'hindi' itself.
+    // spelled with NA + virama: HA, I, NA, VIRAMA, DA, II
+    checkOneTermReuse(a, "\u0939\u093F\u0928\u094D\u0926\u0940", "\u0939\u093F\u0902\u0926");
+    // spelled with anusvara: HA, I, ANUSVARA, DA, II
+    checkOneTermReuse(a, "\u0939\u093F\u0902\u0926\u0940", "\u0939\u093F\u0902\u0926");
+  }
+  
+  /** A word in the exclusion set must come out of the analyzer unchanged. */
+  public void testExclusionSet() throws Exception {
+    Set<String> exclusionSet = new HashSet<String>();
+    exclusionSet.add("\u0939\u093F\u0902\u0926\u0940");
+    Analyzer a = new HindiAnalyzer(Version.LUCENE_CURRENT, 
+        HindiAnalyzer.getDefaultStopSet(), exclusionSet);
+    checkOneTermReuse(a, "\u0939\u093F\u0902\u0926\u0940", "\u0939\u093F\u0902\u0926\u0940");
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiNormalizer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiNormalizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiNormalizer.java Thu Feb  4 12:41:56 2010
@@ -0,0 +1,68 @@
+package org.apache.lucene.analysis.hi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.util.Version;
+
+/**
+ * Test HindiNormalizer
+ */
+public class TestHindiNormalizer extends BaseTokenStreamTestCase {
+  /**
+   * Test some basic normalization, with an example from the paper.
+   */
+  public void testBasics() throws IOException {
+    check("à¤à¤à¤à¤°à¥à¤à¤¼à¥", "à¤à¤à¤à¤°à¥à¤à¤¿");
+    check("à¤à¤à¤à¤°à¥à¤à¥", "à¤à¤à¤à¤°à¥à¤à¤¿");
+    check("à¤à¤à¤à¥à¤°à¥à¤à¤¼à¥", "à¤à¤à¤à¤°à¥à¤à¤¿");
+    check("à¤à¤à¤à¥à¤°à¥à¤à¥", "à¤à¤à¤à¤°à¥à¤à¤¿");
+    check("à¤à¤à¤à¤°à¥à¤à¤¼à¥", "à¤à¤à¤à¤°à¥à¤à¤¿");
+    check("à¤à¤à¤à¤°à¥à¤à¥", "à¤à¤à¤à¤°à¥à¤à¤¿");
+    check("à¤à¤à¤à¥à¤°à¥à¤à¤¼à¥", "à¤à¤à¤à¤°à¥à¤à¤¿");
+    check("à¤à¤à¤à¥à¤°à¥à¤à¥", "à¤à¤à¤à¤°à¥à¤à¤¿");
+  }
+  
+  public void testDecompositions() throws IOException {
+    // removing nukta dot
+    check("à¥à¤¿à¤¤à¤¾à¤¬", "à¤à¤¿à¤¤à¤¾à¤¬");
+    check("à¥à¤°à¥à¥", "à¤«à¤°à¤");
+    check("à¥à¤°à¥à¥", "à¤à¤°à¤");
+    // some other composed nukta forms
+    check("à¤±à¤´à¥à¥à¥à¥à¥", "à¤°à¤³à¤à¤à¤¡à¤¢à¤¯");
+    // removal of format (ZWJ/ZWNJ)
+    check("à¤¶à¤¾à¤°à¥âà¤®à¤¾", "à¤¶à¤¾à¤°à¤®à¤¾");
+    check("à¤¶à¤¾à¤°à¥âà¤®à¤¾", "à¤¶à¤¾à¤°à¤®à¤¾");
+    // removal of chandra
+    check("à¥à¥à¥à¥à¤à¤à¤à¤\u0972", "à¥à¥à¥à¥à¤à¤à¤à¤à¤");
+    // vowel shortening
+    check("à¤à¤à¤à¥ à¥¡à¤à¤à¥à¥à¥à¥£à¥à¥", "à¤à¤à¤à¤à¤à¤à¤à¤¿à¥à¥à¥¢à¥à¥");
+  }
+  private void check(String input, String output) throws IOException {
+    Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, 
+        new StringReader(input));
+    TokenFilter tf = new HindiNormalizationFilter(tokenizer);
+    assertTokenStreamContents(tf, new String[] { output });
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiNormalizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java Thu Feb  4 12:41:56 2010
@@ -0,0 +1,90 @@
+package org.apache.lucene.analysis.hi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.util.Version;
+
+/**
+ * Test HindiStemmer
+ */
+public class TestHindiStemmer extends BaseTokenStreamTestCase {
+  /**
+   * Test masc noun inflections
+   */
+  public void testMasculineNouns() throws IOException {
+    check("à¤²à¤¡à¤à¤¾", "à¤²à¤¡à¤");
+    check("à¤²à¤¡à¤à¥", "à¤²à¤¡à¤");
+    check("à¤²à¤¡à¤à¥à¤", "à¤²à¤¡à¤");
+    
+    check("à¤à¥à¤°à¥", "à¤à¥à¤°");
+    check("à¤à¥à¤°à¥à¤à¤", "à¤à¥à¤°");
+    
+    check("à¤¦à¥à¤¸à¥à¤¤", "à¤¦à¥à¤¸à¥à¤¤");
+    check("à¤¦à¥à¤¸à¥à¤¤à¥à¤", "à¤¦à¥à¤¸à¥à¤¤");
+  }
+  
+  /**
+   * Test feminine noun inflections
+   */
+  public void testFeminineNouns() throws IOException {
+    check("à¤²à¤¡à¤à¥", "à¤²à¤¡à¤");
+    check("à¤²à¤¡à¤à¤¿à¤¯à¥à¤", "à¤²à¤¡à¤");
+    
+    check("à¤à¤¿à¤¤à¤¾à¤¬", "à¤à¤¿à¤¤à¤¾à¤¬");
+    check("à¤à¤¿à¤¤à¤¾à¤¬à¥à¤", "à¤à¤¿à¤¤à¤¾à¤¬");
+    check("à¤à¤¿à¤¤à¤¾à¤¬à¥à¤", "à¤à¤¿à¤¤à¤¾à¤¬");
+    
+    check("à¤à¤§à¥à¤¯à¤¾à¤ªà¥à¤à¤¾", "à¤à¤§à¥à¤¯à¤¾à¤ªà¥à¤");
+    check("à¤à¤§à¥à¤¯à¤¾à¤ªà¥à¤à¤¾à¤à¤", "à¤à¤§à¥à¤¯à¤¾à¤ªà¥à¤");
+    check("à¤à¤§à¥à¤¯à¤¾à¤ªà¥à¤à¤¾à¤à¤", "à¤à¤§à¥à¤¯à¤¾à¤ªà¥à¤");
+  }
+  
+  /**
+   * Test some verb forms
+   */
+  public void testVerbs() throws IOException {
+    check("à¤à¤¾à¤¨à¤¾", "à¤à¤¾");
+    check("à¤à¤¾à¤¤à¤¾", "à¤à¤¾");
+    check("à¤à¤¾à¤¤à¥", "à¤à¤¾");
+    check("à¤à¤¾", "à¤à¤¾");
+  }
+  
+  /**
+   * From the paper: since the suffix list for verbs includes AI, awA and anI,
+   * additional suffixes had to be added to the list for noun/adjectives
+   * ending with these endings.
+   */
+  public void testExceptions() throws IOException {
+    check("à¤à¤ à¤¿à¤¨à¤¾à¤à¤¯à¤¾à¤", "à¤à¤ à¤¿à¤¨");
+    check("à¤à¤ à¤¿à¤¨", "à¤à¤ à¤¿à¤¨");
+  }
+  
+  private void check(String input, String output) throws IOException {
+    Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, 
+        new StringReader(input));
+    TokenFilter tf = new HindiStemFilter(tokenizer);
+    assertTokenStreamContents(tf, new String[] { output });
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java Thu Feb  4 12:41:56 2010
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.in;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.util.Version;
+
+/**
+ * Tests for {@link IndicNormalizationFilter}.
+ */
+public class TestIndicNormalizer extends BaseTokenStreamTestCase {
+  /**
+   * Runs a handful of input/expected pairs through the normalization filter.
+   */
+  public void testBasics() throws IOException {
+    check("à¤à¤¾à¥à¤à¤¾à¥", "à¤à¤");
+    check("à¤à¤¾à¥à¤à¤¾à¥", "à¤à¤");
+    check("à¤à¤¾à¥à¤à¤¾à¥", "à¤à¤");
+    check("à¤à¤¾à¥à¤à¤¾à¥", "à¤à¤");
+    check("à¤à¤¾à¤à¤¾", "à¤à¤");
+    check("à¤à¤¾à¥à¤°", "à¤à¤°");
+    // khanda-ta
+    check("à¦¤à§â", "à§");
+  }
+  
+  /** Runs {@code input} through WhitespaceTokenizer + IndicNormalizationFilter and checks it against {@code { output }}. */
+  private void check(String input, String output) throws IOException {
+    Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(input));
+    TokenFilter normalized = new IndicNormalizationFilter(source);
+    assertTokenStreamContents(normalized, new String[] { output });
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java Thu Feb  4 12:41:56 2010
@@ -0,0 +1,45 @@
+package org.apache.lucene.analysis.in;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.Version;
+
+/**
+ * Test IndicTokenizer
+ */
+public class TestIndicTokenizer extends BaseTokenStreamTestCase {
+  /** Test tokenizing Indic vowels, signs, and punctuation */
+  public void testBasics() throws IOException {
+    TokenStream ts = new IndicTokenizer(Version.LUCENE_CURRENT,
+        new StringReader("à¤®à¥à¤à¥ à¤¹à¤¿à¤à¤¦à¥ à¤à¤¾ à¤à¤° à¤à¤à¥à¤¯à¤¾à¤¸ à¤à¤°à¤¨à¤¾ à¤¹à¥à¤à¤¾ à¥¤"));
+    assertTokenStreamContents(ts,
+        new String[] { "à¤®à¥à¤à¥", "à¤¹à¤¿à¤à¤¦à¥", "à¤à¤¾", "à¤à¤°", "à¤à¤à¥à¤¯à¤¾à¤¸", "à¤à¤°à¤¨à¤¾", "à¤¹à¥à¤à¤¾" });
+  }
+  
+  /** Test that words with format chars such as ZWJ are kept */
+  public void testFormat() throws Exception {
+    TokenStream ts = new IndicTokenizer(Version.LUCENE_CURRENT,
+        new StringReader("à¤¶à¤¾à¤°à¥âà¤®à¤¾ à¤¶à¤¾à¤°à¥âà¤®à¤¾"));
+    assertTokenStreamContents(ts, new String[] { "à¤¶à¤¾à¤°à¥âà¤®à¤¾", "à¤¶à¤¾à¤°à¥âà¤®à¤¾" });
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java
------------------------------------------------------------------------------
    svn:eol-style = native