You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/02/04 13:42:03 UTC
svn commit: r906468 - in /lucene/java/trunk: ./ contrib/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/
contrib/analyzers/common/src/resources/org/apache/lucene/analysis...
Author: rmuir
Date: Thu Feb 4 12:41:56 2010
New Revision: 906468
URL: http://svn.apache.org/viewvc?rev=906468&view=rev
Log:
LUCENE-2234: Hindi Analyzer
Added:
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilter.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilter.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/package.html (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilter.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/package.html (with props)
lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/
lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiNormalizer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java (with props)
Modified:
lucene/java/trunk/NOTICE.txt
lucene/java/trunk/contrib/CHANGES.txt
Modified: lucene/java/trunk/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/NOTICE.txt?rev=906468&r1=906467&r2=906468&view=diff
==============================================================================
--- lucene/java/trunk/NOTICE.txt (original)
+++ lucene/java/trunk/NOTICE.txt Thu Feb 4 12:41:56 2010
@@ -28,6 +28,11 @@
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
See http://members.unine.ch/jacques.savoy/clef/index.html.
+The Hindi analyzer (contrib/analyzers) comes with a default
+stopword list that is BSD-licensed and was created by Jacques Savoy. The file resides in
+contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt.
+See http://members.unine.ch/jacques.savoy/clef/index.html.
+
Includes lib/servlet-api-2.4.jar from Apache Tomcat
The SmartChineseAnalyzer source code (under contrib/analyzers) was
Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=906468&r1=906467&r2=906468&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Thu Feb 4 12:41:56 2010
@@ -103,6 +103,8 @@
character is now configurable. Its also up to 20% faster.
(Steven Rowe via Robert Muir)
+ * LUCENE-2234: Add a Hindi analyzer. (Robert Muir)
+
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java Thu Feb 4 12:41:56 2010
@@ -0,0 +1,132 @@
+package org.apache.lucene.analysis.hi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.in.IndicNormalizationFilter;
+import org.apache.lucene.analysis.in.IndicTokenizer;
+import org.apache.lucene.util.Version;
+
+/**
+ * Analyzer for Hindi.
+ * <p>
+ * Tokens are produced by an {@link IndicTokenizer} and then lower-cased,
+ * optionally marked as keywords, normalized, filtered against the stopword
+ * set and finally stemmed.
+ * </p>
+ */
+public final class HindiAnalyzer extends StopwordAnalyzerBase {
+  /** Terms in this set are marked as keywords right after lower-casing. */
+  private final Set<?> stemExclusionSet;
+
+  /**
+   * File containing default Hindi stopwords.
+   *
+   * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
+   * The stopword list is BSD-Licensed.
+   */
+  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+  /** Comment marker handed to loadStopwordSet when reading the default stopword file. */
+  private static final String STOPWORDS_COMMENT = "#";
+
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static Set<?> getDefaultStopSet() {
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set<?> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadStopwordSet(false, HindiAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR); keep the IOException as the cause so that a
+        // broken packaging can be diagnosed from the stack trace
+        throw new RuntimeException("Unable to load default stopword set", ex);
+      }
+    }
+  }
+
+  /**
+   * Builds an analyzer with the given stop words and stemming exclusion set.
+   * A private, unmodifiable copy of the exclusion set is kept.
+   *
+   * @param version lucene compatibility version
+   * @param stopwords a stopword set
+   * @param stemExclusionSet a stemming exclusion set
+   */
+  public HindiAnalyzer(Version version, Set<?> stopwords, Set<?> stemExclusionSet) {
+    super(version, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(
+        CharArraySet.copy(matchVersion, stemExclusionSet));
+  }
+
+  /**
+   * Builds an analyzer with the given stop words and an empty stemming
+   * exclusion set.
+   *
+   * @param version lucene compatibility version
+   * @param stopwords a stopword set
+   */
+  public HindiAnalyzer(Version version, Set<?> stopwords) {
+    this(version, stopwords, CharArraySet.EMPTY_SET);
+  }
+
+  /**
+   * Builds an analyzer with the default stop words:
+   * {@link #DEFAULT_STOPWORD_FILE}.
+   *
+   * @param version lucene compatibility version
+   */
+  public HindiAnalyzer(Version version) {
+    this(version, DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+
+  /**
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
+   * {@link Reader}.
+   *
+   * @param fieldName name of the field being analyzed (not consulted here)
+   * @param reader source of the text to analyze
+   * @return {@link TokenStreamComponents} built from a {@link IndicTokenizer}
+   *         filtered with {@link LowerCaseFilter},
+   *         {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
+   *         {@link IndicNormalizationFilter},
+   *         {@link HindiNormalizationFilter},
+   *         {@link StopFilter} using the Hindi stop words, and
+   *         {@link HindiStemFilter}
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new IndicTokenizer(matchVersion, reader);
+    TokenStream result = new LowerCaseFilter(matchVersion, source);
+    // Mark exclusions first: HindiNormalizationFilter and HindiStemFilter
+    // both skip tokens whose keyword flag is set.
+    if (!stemExclusionSet.isEmpty()) {
+      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+    }
+    result = new IndicNormalizationFilter(result);
+    result = new HindiNormalizationFilter(result);
+    // Stopwords are removed after normalization but before stemming.
+    result = new StopFilter(matchVersion, result, stopwords);
+    result = new HindiStemFilter(result);
+    return new TokenStreamComponents(source, result);
+  }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilter.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilter.java Thu Feb 4 12:41:56 2010
@@ -0,0 +1,59 @@
+package org.apache.lucene.analysis.hi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * {@link TokenFilter} that rewrites each term with {@link HindiNormalizer} in
+ * order to normalize its orthography.
+ * <p>
+ * Normalization can conflate unrelated terms. To keep a term from being
+ * normalized, place a {@link KeywordMarkerTokenFilter} (or any custom
+ * {@link TokenFilter} that sets the {@link KeywordAttribute}) ahead of this
+ * {@link TokenStream}; tokens flagged as keywords are passed through as-is.
+ * </p>
+ * @see HindiNormalizer
+ */
+public final class HindiNormalizationFilter extends TokenFilter {
+
+  private final HindiNormalizer hindiNormalizer = new HindiNormalizer();
+  private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
+  private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
+
+  public HindiNormalizationFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      // upstream is exhausted
+      return false;
+    }
+    if (keywordAttribute.isKeyword()) {
+      // protected term: leave the buffer alone
+      return true;
+    }
+    final char[] buffer = termAttribute.termBuffer();
+    final int length = termAttribute.termLength();
+    // the normalizer edits the buffer in place and reports the new length
+    termAttribute.setTermLength(hindiNormalizer.normalize(buffer, length));
+    return true;
+  }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java Thu Feb 4 12:41:56 2010
@@ -0,0 +1,194 @@
+package org.apache.lucene.analysis.hi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Normalizer for Hindi.
+ * <p>
+ * Folds a number of spelling variations onto a single form.
+ * <p>
+ * Implements the Hindi-language specific algorithm specified in:
+ * <i>Word normalization in Indian languages</i>
+ * Prasad Pingali and Vasudeva Varma.
+ * http://web2py.iiit.ac.in/publications/default/download/inproceedings.pdf.3fe5b38c-02ee-41ce-9a8f-3e745670be32.pdf
+ * <p>
+ * with the following additions from <i>Hindi CLIR in Thirty Days</i>
+ * Leah S. Larkey, Margaret E. Connell, and Nasreen AbdulJaleel.
+ * http://maroo.cs.umass.edu/pub/web/getpdf.php?id=454:
+ * <ul>
+ *  <li>Internal Zero-width joiner and Zero-width non-joiners are removed
+ *  <li>In addition to chandrabindu, NA+halant is normalized to anusvara
+ * </ul>
+ */
+public class HindiNormalizer {
+  /** First code unit covered by {@link #FOLD}. */
+  private static final char DEVANAGARI_START = '\u0900';
+
+  private static final char ANUSVARA = '\u0902';
+  private static final char NA = '\u0928';
+  private static final char NUKTA = '\u093C';
+  private static final char VIRAMA = '\u094D';
+  private static final char ZWNJ = '\u200C';
+  private static final char ZWJ = '\u200D';
+
+  /** One-to-one replacements, stored as consecutive (from, to) pairs. */
+  private static final String FOLD_PAIRS =
+      // candrabindu -> bindu
+      "\u0901\u0902"
+      // nukta deletions: letters encoded with a nukta -> plain letter
+      + "\u0929\u0928" + "\u0931\u0930" + "\u0934\u0933"
+      + "\u0958\u0915" + "\u0959\u0916" + "\u095A\u0917" + "\u095B\u091C"
+      + "\u095C\u0921" + "\u095D\u0922" + "\u095E\u092B" + "\u095F\u092F"
+      // chandra/short -> replace
+      + "\u0945\u0947" + "\u0946\u0947"
+      + "\u0949\u094B" + "\u094A\u094B"
+      + "\u090D\u090F" + "\u090E\u090F"
+      + "\u0911\u0913" + "\u0912\u0913"
+      + "\u0972\u0905"
+      // long -> short ind. vowels
+      + "\u0906\u0905" + "\u0908\u0907" + "\u090A\u0909" + "\u0960\u090B"
+      + "\u0961\u090C" + "\u0910\u090F" + "\u0914\u0913"
+      // long -> short dep. vowels
+      + "\u0940\u093F" + "\u0942\u0941" + "\u0944\u0943" + "\u0963\u0962"
+      + "\u0948\u0947" + "\u094C\u094B";
+
+  /**
+   * Replacement for each code unit in U+0900..U+097F, indexed by offset from
+   * {@link #DEVANAGARI_START}; entries without a rule map to themselves.
+   */
+  private static final char[] FOLD = new char[0x80];
+
+  static {
+    for (int k = 0; k < FOLD.length; k++) {
+      FOLD[k] = (char) (DEVANAGARI_START + k);
+    }
+    for (int k = 0; k < FOLD_PAIRS.length(); k += 2) {
+      FOLD[FOLD_PAIRS.charAt(k) - DEVANAGARI_START] = FOLD_PAIRS.charAt(k + 1);
+    }
+  }
+
+  /**
+   * Normalize an input buffer of Hindi text. The buffer is edited in place.
+   *
+   * @param s input buffer
+   * @param len length of input buffer
+   * @return length of input buffer after normalization
+   */
+  public int normalize(char s[], int len) {
+    int pos = 0;
+    while (pos < len) {
+      final char ch = s[pos];
+
+      // nukta, virama and zwj/zwnj are dropped; the next character slides
+      // into this slot, so the position is not advanced
+      if (ch == NUKTA || ch == VIRAMA || ch == ZWJ || ch == ZWNJ) {
+        len = delete(s, pos, len);
+        continue;
+      }
+
+      if (ch == NA) {
+        // dead n -> bindu
+        if (pos + 1 < len && s[pos + 1] == VIRAMA) {
+          s[pos] = ANUSVARA;
+          len = delete(s, pos + 1, len);
+        }
+      } else if (ch >= DEVANAGARI_START && ch - DEVANAGARI_START < FOLD.length) {
+        final char folded = FOLD[ch - DEVANAGARI_START];
+        if (folded != ch) {
+          s[pos] = folded;
+        }
+      }
+      pos++;
+    }
+    return len;
+  }
+
+  /**
+   * Delete a character in-place
+   *
+   * @param s Input Buffer
+   * @param pos Position of character to delete
+   * @param len length of input buffer
+   * @return length of input buffer after deletion
+   */
+  protected int delete(char s[], int pos, int len) {
+    final int shortened = len - 1;
+    if (pos < len) {
+      // shift everything right of pos one slot to the left
+      System.arraycopy(s, pos + 1, s, pos, shortened - pos);
+    }
+    return shortened;
+  }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilter.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilter.java Thu Feb 4 12:41:56 2010
@@ -0,0 +1,49 @@
+package org.apache.lucene.analysis.hi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link HindiStemmer} to stem Hindi words.
+ * <p>
+ * Tokens whose {@link KeywordAttribute} is set are passed through unstemmed.
+ * </p>
+ */
+public final class HindiStemFilter extends TokenFilter {
+  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+  private final HindiStemmer stemmer = new HindiStemmer();
+
+  /**
+   * Creates a stem filter over the given stream.
+   * <p>
+   * The constructor is public, like that of {@link HindiNormalizationFilter}:
+   * this class is final, so a protected constructor would only be reachable
+   * from inside this package and the filter could not be used in custom
+   * analysis chains.
+   * </p>
+   *
+   * @param input the stream whose terms are stemmed
+   */
+  public HindiStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      if (!keywordAtt.isKeyword()) {
+        // the stemmer only reports the new length; truncate the term to it
+        termAtt.setTermLength(stemmer.stem(termAtt.termBuffer(), termAtt.termLength()));
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java Thu Feb 4 12:41:56 2010
@@ -0,0 +1,130 @@
+package org.apache.lucene.analysis.hi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Light Stemmer for Hindi.
+ * <p>
+ * Implements the algorithm specified in:
+ * <i>A Lightweight Stemmer for Hindi</i>
+ * Ananthakrishnan Ramanathan and Durgesh D Rao.
+ * http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
+ * </p>
+ * <p>
+ * The suffixes are written as Unicode escapes so that the table does not
+ * depend on the charset used to read or transmit this source file.
+ * </p>
+ */
+public class HindiStemmer {
+
+  /**
+   * Suffix groups ordered from longest (5 chars) to shortest (1 char). All
+   * suffixes within one group have the same length.
+   */
+  private static final String[][] SUFFIXES = {
+    // 5
+    {
+      "\u093E\u090F\u0902\u0917\u0940", // ाएंगी
+      "\u093E\u090F\u0902\u0917\u0947", // ाएंगे
+      "\u093E\u090A\u0902\u0917\u0940", // ाऊंगी
+      "\u093E\u090A\u0902\u0917\u093E", // ाऊंगा
+      "\u093E\u0907\u092F\u093E\u0901", // ाइयाँ
+      "\u093E\u0907\u092F\u094B\u0902", // ाइयों
+      "\u093E\u0907\u092F\u093E\u0902", // ाइयां
+    },
+    // 4
+    {
+      "\u093E\u090F\u0917\u0940", // ाएगी
+      "\u093E\u090F\u0917\u093E", // ाएगा
+      "\u093E\u0913\u0917\u0940", // ाओगी
+      "\u093E\u0913\u0917\u0947", // ाओगे
+      "\u090F\u0902\u0917\u0940", // एंगी
+      "\u0947\u0902\u0917\u0940", // ेंगी
+      "\u090F\u0902\u0917\u0947", // एंगे
+      "\u0947\u0902\u0917\u0947", // ेंगे
+      "\u0942\u0902\u0917\u0940", // ूंगी
+      "\u0942\u0902\u0917\u093E", // ूंगा
+      "\u093E\u0924\u0940\u0902", // ातीं
+      "\u0928\u093E\u0913\u0902", // नाओं
+      "\u0928\u093E\u090F\u0902", // नाएं
+      "\u0924\u093E\u0913\u0902", // ताओं
+      "\u0924\u093E\u090F\u0902", // ताएं
+      "\u093F\u092F\u093E\u0901", // ियाँ
+      "\u093F\u092F\u094B\u0902", // ियों
+      "\u093F\u092F\u093E\u0902", // ियां
+    },
+    // 3
+    {
+      "\u093E\u0915\u0930", // ाकर
+      "\u093E\u0907\u090F", // ाइए
+      "\u093E\u0908\u0902", // ाईं
+      "\u093E\u092F\u093E", // ाया
+      "\u0947\u0917\u0940", // ेगी
+      "\u0947\u0917\u093E", // ेगा
+      "\u094B\u0917\u0940", // ोगी
+      "\u094B\u0917\u0947", // ोगे
+      "\u093E\u0928\u0947", // ाने
+      "\u093E\u0928\u093E", // ाना
+      "\u093E\u0924\u0947", // ाते
+      "\u093E\u0924\u0940", // ाती
+      "\u093E\u0924\u093E", // ाता
+      "\u0924\u0940\u0902", // तीं
+      "\u093E\u0913\u0902", // ाओं
+      "\u093E\u090F\u0902", // ाएं
+      "\u0941\u0913\u0902", // ुओं
+      "\u0941\u090F\u0902", // ुएं
+      "\u0941\u0906\u0902", // ुआं
+    },
+    // 2
+    {
+      "\u0915\u0930", // कर
+      "\u093E\u0913", // ाओ
+      "\u093F\u090F", // िए
+      "\u093E\u0908", // ाई
+      "\u093E\u090F", // ाए
+      "\u0928\u0947", // ने
+      "\u0928\u0940", // नी
+      "\u0928\u093E", // ना
+      "\u0924\u0947", // ते
+      "\u0940\u0902", // ीं
+      "\u0924\u0940", // ती
+      "\u0924\u093E", // ता
+      "\u093E\u0901", // ाँ
+      "\u093E\u0902", // ां
+      "\u094B\u0902", // ों
+      "\u0947\u0902", // ें
+    },
+    // 1
+    {
+      "\u094B", // ो
+      "\u0947", // े
+      "\u0942", // ू
+      "\u0941", // ु
+      "\u0940", // ी
+      "\u093F", // ि
+      "\u093E", // ा
+    },
+  };
+
+  /**
+   * Stem an input buffer of Hindi text. The buffer itself is not modified;
+   * the caller truncates the term to the returned length.
+   *
+   * @param buffer input buffer
+   * @param len length of input buffer
+   * @return length of input buffer after removing the longest matching suffix,
+   *         or <code>len</code> if no suffix applies
+   */
+  public int stem(char buffer[], int len) {
+    for (int g = 0; g < SUFFIXES.length; g++) {
+      final String[] group = SUFFIXES[g];
+      final int suffixLen = group[0].length();
+      // a suffix is only stripped when at least two characters remain
+      if (len > suffixLen + 1) {
+        for (int k = 0; k < group.length; k++) {
+          if (endsWith(buffer, len, group[k])) {
+            return len - suffixLen;
+          }
+        }
+      }
+    }
+    return len;
+  }
+
+  /** Returns true if the first <code>len</code> chars of <code>s</code> end with <code>suffix</code>. */
+  private static boolean endsWith(final char s[], final int len, final String suffix) {
+    final int suffixLen = suffix.length();
+    if (suffixLen > len) {
+      return false;
+    }
+    for (int i = suffixLen - 1; i >= 0; i--) {
+      if (s[len - (suffixLen - i)] != suffix.charAt(i)) {
+        return false;
+      }
+    }
+    return true;
+  }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/package.html?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/package.html Thu Feb 4 12:41:56 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Hindi.
+</body>
+</html>
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilter.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilter.java Thu Feb 4 12:41:56 2010
@@ -0,0 +1,47 @@
+package org.apache.lucene.analysis.in;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * {@link TokenFilter} that passes every term through {@link IndicNormalizer}
+ * to normalize text in Indian languages.
+ */
+public final class IndicNormalizationFilter extends TokenFilter {
+  private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
+  private final IndicNormalizer indicNormalizer = new IndicNormalizer();
+
+  public IndicNormalizationFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    final boolean hasToken = input.incrementToken();
+    if (hasToken) {
+      // normalize in place, then shrink/grow the term to the reported length
+      final char[] buffer = termAttribute.termBuffer();
+      final int normalizedLength = indicNormalizer.normalize(buffer, termAttribute.termLength());
+      termAttribute.setTermLength(normalizedLength);
+    }
+    return hasToken;
+  }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java Thu Feb 4 12:41:56 2010
@@ -0,0 +1,303 @@
+package org.apache.lucene.analysis.in;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.BitSet;
+import java.util.IdentityHashMap;
+import static java.lang.Character.UnicodeBlock.*;
+
+/**
+ * Normalizes the Unicode representation of text in Indian languages.
+ * <p>
+ * Follows guidelines from Unicode 5.2, chapter 6, South Asian Scripts I
+ * and graphical decompositions from http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
+ * </p>
+ */
+public class IndicNormalizer {
+
+ private static class ScriptData { // per-script settings: flag bit, code page base, decomposition mask
+ final int flag; // bit identifying this script in the flags column of the decompositions table
+ final int base; // start of the script's code page; text[i] - base gives the table offset
+ BitSet decompMask; // offsets that can start (ch1) a decomposition for this script; built in the second static block
+
+ ScriptData(int flag, int base) {
+ this.flag = flag;
+ this.base = base;
+ }
+ }
+
+ private static final IdentityHashMap<Character.UnicodeBlock,ScriptData> scripts = // supported Unicode blocks and their ScriptData
+ new IdentityHashMap<Character.UnicodeBlock,ScriptData>(9);
+
+ private static int flag(Character.UnicodeBlock ub) { // flag bit of a registered block; NullPointerException if ub is not a key of scripts
+ return scripts.get(ub).flag;
+ }
+
+ static { // must stay above the decompositions table: its initializer calls flag()
+ scripts.put(DEVANAGARI, new ScriptData(1, 0x0900));
+ scripts.put(BENGALI, new ScriptData(2, 0x0980));
+ scripts.put(GURMUKHI, new ScriptData(4, 0x0A00));
+ scripts.put(GUJARATI, new ScriptData(8, 0x0A80));
+ scripts.put(ORIYA, new ScriptData(16, 0x0B00));
+ scripts.put(TAMIL, new ScriptData(32, 0x0B80));
+ scripts.put(TELUGU, new ScriptData(64, 0x0C00));
+ scripts.put(KANNADA, new ScriptData(128, 0x0C80));
+ scripts.put(MALAYALAM, new ScriptData(256, 0x0D00));
+ }
+
+ /**
+ * Decompositions according to Unicode 5.2,
+ * and http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
+ *
+ * Most of these are not handled by unicode normalization anyway.
+ *
+ * The numbers here represent offsets into the respective codepages,
+ * with -1 representing null and 0xFF representing zero-width joiner.
+ *
+ * the columns are: ch1, ch2, ch3, res, flags
+ * ch1, ch2, and ch3 are the decomposition
+ * res is the composition, and flags are the scripts to which it applies.
+ *
+ * Row order matters: compose() scans the rows top to bottom and stops at the
+ * first match, so a 3-char decomposition must come before any 2-char
+ * decomposition that shares its first two chars.
+ */
+ private static final int decompositions[][] = {
+ /* devanagari, gujarati vowel candra O */
+ { 0x05, 0x3E, 0x45, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari short O */
+ { 0x05, 0x3E, 0x46, 0x12, flag(DEVANAGARI) },
+ /* devanagari, gujarati letter O */
+ { 0x05, 0x3E, 0x47, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari, gujarati letter AU */
+ { 0x05, 0x3E, 0x48, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari, bengali, gurmukhi, gujarati, oriya AA */
+ { 0x05, 0x3E, -1, 0x06, flag(DEVANAGARI) | flag(BENGALI) | flag(GURMUKHI) | flag(GUJARATI) | flag(ORIYA) },
+ /* devanagari letter candra A */
+ { 0x05, 0x45, -1, 0x72, flag(DEVANAGARI) },
+ /* gujarati vowel candra E */
+ { 0x05, 0x45, -1, 0x0D, flag(GUJARATI) },
+ /* devanagari letter short A */
+ { 0x05, 0x46, -1, 0x04, flag(DEVANAGARI) },
+ /* gujarati letter E */
+ { 0x05, 0x47, -1, 0x0F, flag(GUJARATI) },
+ /* gurmukhi, gujarati letter AI */
+ { 0x05, 0x48, -1, 0x10, flag(GURMUKHI) | flag(GUJARATI) },
+ /* devanagari, gujarati vowel candra O */
+ { 0x05, 0x49, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari short O */
+ { 0x05, 0x4A, -1, 0x12, flag(DEVANAGARI) },
+ /* devanagari, gujarati letter O */
+ { 0x05, 0x4B, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari, gurmukhi, gujarati letter AU */
+ { 0x05, 0x4C, -1, 0x14, flag(DEVANAGARI) | flag(GURMUKHI) | flag(GUJARATI) },
+ /* devanagari, gujarati vowel candra O */
+ { 0x06, 0x45, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari short O */
+ { 0x06, 0x46, -1, 0x12, flag(DEVANAGARI) },
+ /* devanagari, gujarati letter O */
+ { 0x06, 0x47, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari, gujarati letter AU */
+ { 0x06, 0x48, -1, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* malayalam letter II */
+ { 0x07, 0x57, -1, 0x08, flag(MALAYALAM) },
+ /* devanagari letter UU */
+ { 0x09, 0x41, -1, 0x0A, flag(DEVANAGARI) },
+ /* tamil, malayalam letter UU (some styles) */
+ { 0x09, 0x57, -1, 0x0A, flag(TAMIL) | flag(MALAYALAM) },
+ /* malayalam letter AI */
+ { 0x0E, 0x46, -1, 0x10, flag(MALAYALAM) },
+ /* devanagari candra E */
+ { 0x0F, 0x45, -1, 0x0D, flag(DEVANAGARI) },
+ /* devanagari short E */
+ { 0x0F, 0x46, -1, 0x0E, flag(DEVANAGARI) },
+ /* devanagari AI */
+ { 0x0F, 0x47, -1, 0x10, flag(DEVANAGARI) },
+ /* oriya AI */
+ { 0x0F, 0x57, -1, 0x10, flag(ORIYA) },
+ /* malayalam letter OO */
+ { 0x12, 0x3E, -1, 0x13, flag(MALAYALAM) },
+ /* telugu, kannada letter AU */
+ { 0x12, 0x4C, -1, 0x14, flag(TELUGU) | flag(KANNADA) },
+ /* telugu letter OO */
+ { 0x12, 0x55, -1, 0x13, flag(TELUGU) },
+ /* tamil, malayalam letter AU */
+ { 0x12, 0x57, -1, 0x14, flag(TAMIL) | flag(MALAYALAM) },
+ /* oriya letter AU */
+ { 0x13, 0x57, -1, 0x14, flag(ORIYA) },
+ /* devanagari qa */
+ { 0x15, 0x3C, -1, 0x58, flag(DEVANAGARI) },
+ /* devanagari, gurmukhi khha */
+ { 0x16, 0x3C, -1, 0x59, flag(DEVANAGARI) | flag(GURMUKHI) },
+ /* devanagari, gurmukhi ghha */
+ { 0x17, 0x3C, -1, 0x5A, flag(DEVANAGARI) | flag(GURMUKHI) },
+ /* devanagari, gurmukhi za */
+ { 0x1C, 0x3C, -1, 0x5B, flag(DEVANAGARI) | flag(GURMUKHI) },
+ /* devanagari dddha, bengali, oriya rra */
+ { 0x21, 0x3C, -1, 0x5C, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
+ /* devanagari, bengali, oriya rha */
+ { 0x22, 0x3C, -1, 0x5D, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
+ /* malayalam chillu nn */
+ { 0x23, 0x4D, 0xFF, 0x7A, flag(MALAYALAM) },
+ /* bengali khanda ta */
+ { 0x24, 0x4D, 0xFF, 0x4E, flag(BENGALI) },
+ /* devanagari nnna */
+ { 0x28, 0x3C, -1, 0x29, flag(DEVANAGARI) },
+ /* malayalam chillu n */
+ { 0x28, 0x4D, 0xFF, 0x7B, flag(MALAYALAM) },
+ /* devanagari, gurmukhi fa */
+ { 0x2B, 0x3C, -1, 0x5E, flag(DEVANAGARI) | flag(GURMUKHI) },
+ /* devanagari, bengali yya */
+ { 0x2F, 0x3C, -1, 0x5F, flag(DEVANAGARI) | flag(BENGALI) },
+ /* telugu letter vocalic R */
+ { 0x2C, 0x41, 0x41, 0x0B, flag(TELUGU) },
+ /* devanagari rra */
+ { 0x30, 0x3C, -1, 0x31, flag(DEVANAGARI) },
+ /* malayalam chillu rr */
+ { 0x30, 0x4D, 0xFF, 0x7C, flag(MALAYALAM) },
+ /* malayalam chillu l */
+ { 0x32, 0x4D, 0xFF, 0x7D, flag(MALAYALAM) },
+ /* devanagari llla */
+ { 0x33, 0x3C, -1, 0x34, flag(DEVANAGARI) },
+ /* malayalam chillu ll */
+ { 0x33, 0x4D, 0xFF, 0x7E, flag(MALAYALAM) },
+ /* telugu letter MA */
+ { 0x35, 0x41, -1, 0x2E, flag(TELUGU) },
+ /* devanagari, gujarati vowel sign candra O */
+ { 0x3E, 0x45, -1, 0x49, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari vowel sign short O */
+ { 0x3E, 0x46, -1, 0x4A, flag(DEVANAGARI) },
+ /* devanagari, gujarati vowel sign O */
+ { 0x3E, 0x47, -1, 0x4B, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari, gujarati vowel sign AU */
+ { 0x3E, 0x48, -1, 0x4C, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* kannada vowel sign II */
+ { 0x3F, 0x55, -1, 0x40, flag(KANNADA) },
+ /* gurmukhi vowel sign UU (when stacking) */
+ { 0x41, 0x41, -1, 0x42, flag(GURMUKHI) },
+ /* tamil, malayalam vowel sign O */
+ { 0x46, 0x3E, -1, 0x4A, flag(TAMIL) | flag(MALAYALAM) },
+ /* kannada vowel sign OO */
+ { 0x46, 0x42, 0x55, 0x4B, flag(KANNADA) },
+ /* kannada vowel sign O */
+ { 0x46, 0x42, -1, 0x4A, flag(KANNADA) },
+ /* malayalam vowel sign AI (if reordered twice) */
+ { 0x46, 0x46, -1, 0x48, flag(MALAYALAM) },
+ /* telugu, kannada vowel sign EE */
+ { 0x46, 0x55, -1, 0x47, flag(TELUGU) | flag(KANNADA) },
+ /* telugu, kannada vowel sign AI */
+ { 0x46, 0x56, -1, 0x48, flag(TELUGU) | flag(KANNADA) },
+ /* tamil, malayalam vowel sign AU */
+ { 0x46, 0x57, -1, 0x4C, flag(TAMIL) | flag(MALAYALAM) },
+ /* bengali, oriya vowel sign O, tamil, malayalam vowel sign OO */
+ { 0x47, 0x3E, -1, 0x4B, flag(BENGALI) | flag(ORIYA) | flag(TAMIL) | flag(MALAYALAM) },
+ /* bengali, oriya vowel sign AU */
+ { 0x47, 0x57, -1, 0x4C, flag(BENGALI) | flag(ORIYA) },
+ /* kannada vowel sign OO */
+ { 0x4A, 0x55, -1, 0x4B, flag(KANNADA) },
+ /* gurmukhi letter I */
+ { 0x72, 0x3F, -1, 0x07, flag(GURMUKHI) },
+ /* gurmukhi letter II */
+ { 0x72, 0x40, -1, 0x08, flag(GURMUKHI) },
+ /* gurmukhi letter EE */
+ { 0x72, 0x47, -1, 0x0F, flag(GURMUKHI) },
+ /* gurmukhi letter U */
+ { 0x73, 0x41, -1, 0x09, flag(GURMUKHI) },
+ /* gurmukhi letter UU */
+ { 0x73, 0x42, -1, 0x0A, flag(GURMUKHI) },
+ /* gurmukhi letter OO */
+ { 0x73, 0x4B, -1, 0x13, flag(GURMUKHI) },
+ };
+
+ static { // for every script, mark each ch1 offset that starts a decomposition applying to that script
+ for (ScriptData sd : scripts.values()) {
+ sd.decompMask = new BitSet(0x7F);
+ for (int i = 0; i < decompositions.length; i++) {
+ final int ch = decompositions[i][0];
+ final int flags = decompositions[i][4];
+ if ((flags & sd.flag) != 0)
+ sd.decompMask.set(ch);
+ }
+ }
+ }
+
+ /**
+ * Normalizes input text, and returns the new length.
+ * The length will always be less than or equal to the existing length.
+ * <p>
+ * The array is modified in place. For each position whose character belongs
+ * to one of the supported scripts and can start a decomposition, a
+ * composition is attempted; scanning then continues at the next position.
+ * </p>
+ *
+ * @param text input text
+ * @param len valid length
+ * @return normalized length
+ */
+ public int normalize(char text[], int len) {
+ for (int i = 0; i < len; i++) {
+ final Character.UnicodeBlock block = Character.UnicodeBlock.of(text[i]);
+ final ScriptData sd = scripts.get(block);
+ if (sd != null) {
+ final int ch = text[i] - sd.base;
+ if (sd.decompMask.get(ch)) // cheap pre-check before scanning the whole table
+ len = compose(ch, block, sd, text, i, len);
+ }
+ }
+ return len;
+ }
+
+ /**
+ * Compose into standard form any compositions in the decompositions table.
+ * <p>
+ * Matches the 2 or 3 chars starting at pos against the table rows that apply
+ * to this script, in table order; the first matching row wins. On a match,
+ * text[pos] is replaced by the composed char and the 1 or 2 following
+ * matched chars are deleted.
+ * </p>
+ *
+ * @param ch0 offset of text[pos] within its script's code page
+ * @param block0 Unicode block of text[pos]
+ * @param sd script data registered for block0
+ * @param text buffer, modified in place
+ * @param pos index of the first char of the candidate sequence
+ * @param len valid length of text
+ * @return the new valid length, or len unchanged when nothing matched
+ */
+ private int compose(int ch0, Character.UnicodeBlock block0, ScriptData sd,
+ char text[], int pos, int len) {
+ if (pos + 1 >= len) /* need at least 2 chars! */
+ return len;
+
+ final int ch1 = text[pos + 1] - sd.base; // only meaningful if text[pos + 1] is in the same block; checked next
+ final Character.UnicodeBlock block1 = Character.UnicodeBlock.of(text[pos + 1]);
+ if (block1 != block0) /* needs to be the same writing system */
+ return len;
+
+ int ch2 = -1; // -1 means "no usable third char"
+
+ if (pos + 2 < len) {
+ ch2 = text[pos + 2] - sd.base;
+ Character.UnicodeBlock block2 = Character.UnicodeBlock.of(text[pos + 2]);
+ if (text[pos + 2] == '\u200D') // ZWJ
+ ch2 = 0xFF;
+ else if (block2 != block1) // still allow a 2-char match
+ ch2 = -1;
+ }
+
+ for (int i = 0; i < decompositions.length; i++)
+ if (decompositions[i][0] == ch0 && (decompositions[i][4] & sd.flag) != 0) {
+ if (decompositions[i][1] == ch1 && (decompositions[i][2] < 0 || decompositions[i][2] == ch2)) {
+ text[pos] = (char) (sd.base + decompositions[i][3]);
+ len = delete(text, pos + 1, len);
+ if (decompositions[i][2] >= 0) // 3-char decomposition: remove the third char as well
+ len = delete(text, pos + 1, len);
+ return len;
+ }
+ }
+
+ return len;
+ }
+
+ /**
+ * Delete a character in-place
+ * by shifting the chars after pos one position to the left.
+ *
+ * @param s buffer, modified in place
+ * @param pos index of the char to delete
+ * @param len valid length of s
+ * @return len - 1 (also when pos is not less than len and nothing was shifted)
+ */
+ private int delete(char s[], int pos, int len) {
+ if (pos < len)
+ System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
+
+ return len - 1;
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java Thu Feb 4 12:41:56 2010
@@ -0,0 +1,50 @@
+package org.apache.lucene.analysis.in;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.CharTokenizer;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
+
+/**
+ * Simple Tokenizer for text in Indian Languages.
+ */
+public final class IndicTokenizer extends CharTokenizer {
+
+ public IndicTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
+ super(matchVersion, factory, input);
+ }
+
+ public IndicTokenizer(Version matchVersion, AttributeSource source, Reader input) {
+ super(matchVersion, source, input);
+ }
+
+ public IndicTokenizer(Version matchVersion, Reader input) {
+ super(matchVersion, input);
+ }
+
+ @Override
+ protected boolean isTokenChar(int c) {
+ return Character.isLetter(c)
+ || Character.getType(c) == Character.NON_SPACING_MARK
+ || Character.getType(c) == Character.FORMAT
+ || Character.getType(c) == Character.COMBINING_SPACING_MARK;
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/package.html?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/package.html Thu Feb 4 12:41:56 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analysis components for Indian languages.
+</body>
+</html>
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/in/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt Thu Feb 4 12:41:56 2010
@@ -0,0 +1,231 @@
+# Also see http://www.opensource.org/licenses/bsd-license.html
+# See http://members.unine.ch/jacques.savoy/clef/index.html.
+# This file was created by Jacques Savoy and is distributed under the BSD license.
+à¤
à¤à¤¦à¤°
+à¤
त
+à¤
पना
+à¤
पनà¥
+à¤
पनà¥
+à¤
à¤à¥
+à¤à¤¦à¤¿
+à¤à¤ª
+à¤à¤¤à¥à¤¯à¤¾à¤¦à¤¿
+à¤à¤¨
+à¤à¤¨à¤à¤¾
+à¤à¤¨à¥à¤¹à¥à¤
+à¤à¤¨à¥à¤¹à¥à¤
+à¤à¤¨à¥à¤¹à¥à¤
+à¤à¤¸
+à¤à¤¸à¤à¤¾
+à¤à¤¸à¤à¥
+à¤à¤¸à¤à¥
+à¤à¤¸à¤®à¥à¤
+à¤à¤¸à¥
+à¤à¤¸à¥
+à¤à¤¨
+à¤à¤¨à¤à¤¾
+à¤à¤¨à¤à¥
+à¤à¤¨à¤à¥
+à¤à¤¨à¤à¥
+à¤à¤¨à¥à¤¹à¥à¤
+à¤à¤¨à¥à¤¹à¥à¤
+à¤à¤¨à¥à¤¹à¥à¤
+à¤à¤¸
+à¤à¤¸à¤à¥
+à¤à¤¸à¥
+à¤à¤¸à¥
+à¤à¤
+à¤à¤µà¤
+à¤à¤¸
+à¤à¤¸à¥
+à¤à¤°
+à¤à¤
+à¤à¤°
+à¤à¤°à¤¤à¤¾
+à¤à¤°à¤¤à¥
+à¤à¤°à¤¨à¤¾
+à¤à¤°à¤¨à¥
+à¤à¤°à¥à¤
+à¤à¤¹à¤¤à¥
+à¤à¤¹à¤¾
+à¤à¤¾
+à¤à¤¾à¥à¥
+à¤à¤¿
+à¤à¤¿à¤¤à¤¨à¤¾
+à¤à¤¿à¤¨à¥à¤¹à¥à¤
+à¤à¤¿à¤¨à¥à¤¹à¥à¤
+à¤à¤¿à¤¯à¤¾
+à¤à¤¿à¤°
+à¤à¤¿à¤¸
+à¤à¤¿à¤¸à¥
+à¤à¤¿à¤¸à¥
+à¤à¥
+à¤à¥à¤
+à¤à¥à¤²
+à¤à¥
+à¤à¥
+à¤à¥à¤
+à¤à¥à¤¨
+à¤à¥à¤¨à¤¸à¤¾
+à¤à¤¯à¤¾
+à¤à¤°
+à¤à¤¬
+à¤à¤¹à¤¾à¤
+à¤à¤¾
+à¤à¤¿à¤¤à¤¨à¤¾
+à¤à¤¿à¤¨
+à¤à¤¿à¤¨à¥à¤¹à¥à¤
+à¤à¤¿à¤¨à¥à¤¹à¥à¤
+à¤à¤¿à¤¸
+à¤à¤¿à¤¸à¥
+à¤à¥à¤§à¤°
+à¤à¥à¤¸à¤¾
+à¤à¥à¤¸à¥
+à¤à¥
+तà¤
+तब
+तरह
+तिन
+तिनà¥à¤¹à¥à¤
+तिनà¥à¤¹à¥à¤
+तिस
+तिसà¥
+तà¥
+था
+थà¥
+थà¥
+दबारा
+दिया
+दà¥à¤¸à¤°à¤¾
+दà¥à¤¸à¤°à¥
+दà¥
+दà¥à¤µà¤¾à¤°à¤¾
+न
+नहà¥à¤
+ना
+निहायत
+नà¥à¤à¥
+नà¥
+पर
+पर
+पहलà¥
+पà¥à¤°à¤¾
+पà¥
+फिर
+बनà¥
+बहà¥
+बहà¥à¤¤
+बाद
+बाला
+बिलà¤à¥à¤²
+à¤à¥
+à¤à¥à¤¤à¤°
+मà¤à¤°
+मानà¥
+मà¥
+मà¥à¤
+यदि
+यह
+यहाà¤
+यहà¥
+या
+यिह
+यà¥
+रà¤à¥à¤
+रहा
+रहà¥
+ऱà¥à¤µà¤¾à¤¸à¤¾
+लिà¤
+लियà¥
+लà¥à¤à¤¿à¤¨
+व
+वरà¥à¤
+वह
+वह
+वहाà¤
+वहà¥à¤
+वालà¥
+वà¥à¤¹
+वà¥
+वà¥à¥à¤°à¤¹
+सà¤à¤
+सà¤à¤¤à¤¾
+सà¤à¤¤à¥
+सबसà¥
+सà¤à¥
+साथ
+साबà¥à¤¤
+साà¤
+सारा
+सà¥
+सà¥
+हà¥
+हà¥à¤
+हà¥à¤
+हà¥à¤
+हà¥
+हà¥à¤
+हà¥
+हà¥à¤¤à¤¾
+हà¥à¤¤à¥
+हà¥à¤¤à¥
+हà¥à¤¨à¤¾
+हà¥à¤¨à¥
+# additional normalized forms of the above
+à¤
पनि
+à¤à¥à¤¸à¥
+हà¥à¤¤à¤¿
+सà¤à¤¿
+तिà¤à¤¹à¥à¤
+à¤à¤à¤¹à¥à¤
+दवारा
+à¤à¤¸à¤¿
+à¤à¤¿à¤à¤¹à¥à¤
+थि
+à¤à¤à¤¹à¥à¤
+à¤à¤°
+à¤à¤¿à¤à¤¹à¥à¤
+वहिà¤
+à¤
à¤à¤¿
+बनि
+हि
+à¤à¤à¤¹à¤¿à¤
+à¤à¤à¤¹à¥à¤
+हà¥à¤
+वà¤à¥à¤°à¤¹
+à¤à¤¸à¥
+रवासा
+à¤à¥à¤¨
+निà¤à¥
+à¤à¤¾à¤«à¤¿
+à¤à¤¸à¤¿
+पà¥à¤°à¤¾
+à¤à¤¿à¤¤à¤°
+हà¥
+बहि
+वहाà¤
+à¤à¥à¤
+यहाà¤
+à¤à¤¿à¤à¤¹à¥à¤
+तिà¤à¤¹à¥à¤
+à¤à¤¿à¤¸à¤¿
+à¤à¤
+यहि
+à¤à¤à¤¹à¤¿à¤
+à¤à¤¿à¤§à¤°
+à¤à¤à¤¹à¥à¤
+à¤
दि
+à¤à¤¤à¤¯à¤¾à¤¦à¤¿
+हà¥à¤
+à¤à¥à¤¨à¤¸à¤¾
+à¤à¤¸à¤à¤¿
+दà¥à¤¸à¤°à¥
+à¤à¤¹à¤¾à¤
+à¤
प
+à¤à¤¿à¤à¤¹à¥à¤
+à¤à¤¨à¤à¤¿
+à¤à¤¿
+वरà¤
+हà¥à¤
+à¤à¥à¤¸à¤¾
+नहिà¤
Propchange: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java Thu Feb 4 12:41:56 2010
@@ -0,0 +1,51 @@
+package org.apache.lucene.analysis.hi;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests the HindiAnalyzer
+ */
+public class TestHindiAnalyzer extends BaseTokenStreamTestCase {
+ /** Constructs a HindiAnalyzer from a Version only; there are no assertions.
+ * This test fails with NPE when the
+ * stopwords file is missing in classpath */
+ public void testResourcesAvailable() {
+ new HindiAnalyzer(Version.LUCENE_CURRENT);
+ }
+
+ public void testBasics() throws Exception {
+ Analyzer a = new HindiAnalyzer(Version.LUCENE_CURRENT);
+ // two ways to write 'hindi' itself.
+ checkOneTermReuse(a, "हिनà¥à¤¦à¥", "हिà¤à¤¦");
+ checkOneTermReuse(a, "हिà¤à¤¦à¥", "हिà¤à¤¦");
+ }
+
+ public void testExclusionSet() throws Exception {
+ Set<String> exclusionSet = new HashSet<String>();
+ exclusionSet.add("हिà¤à¤¦à¥");
+ Analyzer a = new HindiAnalyzer(Version.LUCENE_CURRENT,
+ HindiAnalyzer.getDefaultStopSet(), exclusionSet);
+ checkOneTermReuse(a, "हिà¤à¤¦à¥", "हिà¤à¤¦à¥");
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiNormalizer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiNormalizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiNormalizer.java Thu Feb 4 12:41:56 2010
@@ -0,0 +1,68 @@
+package org.apache.lucene.analysis.hi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.util.Version;
+
+/**
+ * Test HindiNormalizer
+ */
+public class TestHindiNormalizer extends BaseTokenStreamTestCase {
+ /**
+ * Test some basic normalization, with an example from the paper.
+ */
+ public void testBasics() throws IOException {
+ check("à¤
à¤à¤à¤°à¥à¤à¤¼à¥", "à¤
à¤à¤à¤°à¥à¤à¤¿");
+ check("à¤
à¤à¤à¤°à¥à¤à¥", "à¤
à¤à¤à¤°à¥à¤à¤¿");
+ check("à¤
à¤à¤à¥à¤°à¥à¤à¤¼à¥", "à¤
à¤à¤à¤°à¥à¤à¤¿");
+ check("à¤
à¤à¤à¥à¤°à¥à¤à¥", "à¤
à¤à¤à¤°à¥à¤à¤¿");
+ check("à¤
à¤à¤à¤°à¥à¤à¤¼à¥", "à¤
à¤à¤à¤°à¥à¤à¤¿");
+ check("à¤
à¤à¤à¤°à¥à¤à¥", "à¤
à¤à¤à¤°à¥à¤à¤¿");
+ check("à¤
à¤à¤à¥à¤°à¥à¤à¤¼à¥", "à¤
à¤à¤à¤°à¥à¤à¤¿");
+ check("à¤
à¤à¤à¥à¤°à¥à¤à¥", "à¤
à¤à¤à¤°à¥à¤à¤¿");
+ }
+
+ public void testDecompositions() throws IOException {
+ // removing nukta dot
+ check("à¥à¤¿à¤¤à¤¾à¤¬", "à¤à¤¿à¤¤à¤¾à¤¬");
+ check("à¥à¤°à¥à¥", "फरà¤");
+ check("à¥à¤°à¥à¥", "à¤à¤°à¤");
+ // some other composed nukta forms
+ check("ऱऴà¥à¥à¥à¥à¥", "रळà¤à¤à¤¡à¤¢à¤¯");
+ // removal of format (ZWJ/ZWNJ)
+ check("शारà¥âमा", "शारमा");
+ check("शारà¥âमा", "शारमा");
+ // removal of chandra
+ check("à¥
à¥à¥à¥à¤à¤à¤à¤\u0972", "à¥à¥à¥à¥à¤à¤à¤à¤à¤
");
+ // vowel shortening
+ check("à¤à¤à¤à¥ ॡà¤à¤à¥à¥à¥à¥£à¥à¥", "à¤
à¤à¤à¤à¤à¤à¤à¤¿à¥à¥à¥¢à¥à¥");
+ }
+ private void check(String input, String output) throws IOException {
+ Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
+ new StringReader(input));
+ TokenFilter tf = new HindiNormalizationFilter(tokenizer);
+ assertTokenStreamContents(tf, new String[] { output });
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiNormalizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java Thu Feb 4 12:41:56 2010
@@ -0,0 +1,90 @@
+package org.apache.lucene.analysis.hi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.util.Version;
+
+/**
+ * Test HindiStemmer
+ */
+public class TestHindiStemmer extends BaseTokenStreamTestCase {
+ /**
+ * Test masc noun inflections
+ */
+ public void testMasculineNouns() throws IOException {
+ check("लडà¤à¤¾", "लडà¤");
+ check("लडà¤à¥", "लडà¤");
+ check("लडà¤à¥à¤", "लडà¤");
+
+ check("à¤à¥à¤°à¥", "à¤à¥à¤°");
+ check("à¤à¥à¤°à¥à¤à¤", "à¤à¥à¤°");
+
+ check("दà¥à¤¸à¥à¤¤", "दà¥à¤¸à¥à¤¤");
+ check("दà¥à¤¸à¥à¤¤à¥à¤", "दà¥à¤¸à¥à¤¤");
+ }
+
+ /**
+ * Test feminine noun inflections
+ */
+ public void testFeminineNouns() throws IOException {
+ check("लडà¤à¥", "लडà¤");
+ check("लडà¤à¤¿à¤¯à¥à¤", "लडà¤");
+
+ check("à¤à¤¿à¤¤à¤¾à¤¬", "à¤à¤¿à¤¤à¤¾à¤¬");
+ check("à¤à¤¿à¤¤à¤¾à¤¬à¥à¤", "à¤à¤¿à¤¤à¤¾à¤¬");
+ check("à¤à¤¿à¤¤à¤¾à¤¬à¥à¤", "à¤à¤¿à¤¤à¤¾à¤¬");
+
+ check("à¤à¤§à¥à¤¯à¤¾à¤ªà¥à¤à¤¾", "à¤à¤§à¥à¤¯à¤¾à¤ªà¥à¤");
+ check("à¤à¤§à¥à¤¯à¤¾à¤ªà¥à¤à¤¾à¤à¤", "à¤à¤§à¥à¤¯à¤¾à¤ªà¥à¤");
+ check("à¤à¤§à¥à¤¯à¤¾à¤ªà¥à¤à¤¾à¤à¤", "à¤à¤§à¥à¤¯à¤¾à¤ªà¥à¤");
+ }
+
+ /**
+ * Test some verb forms
+ */
+ public void testVerbs() throws IOException {
+ check("à¤à¤¾à¤¨à¤¾", "à¤à¤¾");
+ check("à¤à¤¾à¤¤à¤¾", "à¤à¤¾");
+ check("à¤à¤¾à¤¤à¥", "à¤à¤¾");
+ check("à¤à¤¾", "à¤à¤¾");
+ }
+
+ /**
+ * From the paper: since the suffix list for verbs includes AI, awA and anI,
+ * additional suffixes had to be added to the list for noun/adjectives
+ * ending with these endings.
+ */
+ public void testExceptions() throws IOException {
+ check("à¤à¤ िनाà¤à¤¯à¤¾à¤", "à¤à¤ िन");
+ check("à¤à¤ िन", "à¤à¤ िन");
+ }
+
+ private void check(String input, String output) throws IOException {
+ Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
+ new StringReader(input));
+ TokenFilter tf = new HindiStemFilter(tokenizer);
+ assertTokenStreamContents(tf, new String[] { output });
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java Thu Feb 4 12:41:56 2010
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.in;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.util.Version;
+
+/**
+ * Test IndicNormalizer
+ */
+public class TestIndicNormalizer extends BaseTokenStreamTestCase {
+ /**
+ * Test some basic normalization
+ */
+ public void testBasics() throws IOException {
+ check("à¤
ाà¥
à¤
ाà¥
", "à¤à¤");
+ check("à¤
ाà¥à¤
ाà¥", "à¤à¤");
+ check("à¤
ाà¥à¤
ाà¥", "à¤à¤");
+ check("à¤
ाà¥à¤
ाà¥", "à¤à¤");
+ check("à¤
ाà¤
ा", "à¤à¤");
+ check("à¤
ाà¥à¤°", "à¤à¤°");
+ // khanda-ta
+ check("তà§â", "à§");
+ }
+
+ private void check(String input, String output) throws IOException {
+ Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
+ new StringReader(input));
+ TokenFilter tf = new IndicNormalizationFilter(tokenizer);
+ assertTokenStreamContents(tf, new String[] { output });
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java?rev=906468&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java Thu Feb 4 12:41:56 2010
@@ -0,0 +1,45 @@
+package org.apache.lucene.analysis.in;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.Version;
+
+/**
+ * Test IndicTokenizer
+ */
+public class TestIndicTokenizer extends BaseTokenStreamTestCase {
+  /** Test tokenizing Indic vowels, signs, and punctuation (the trailing danda is not a token) */
+  public void testBasics() throws IOException {
+    TokenStream ts = new IndicTokenizer(Version.LUCENE_CURRENT,
+        new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।"));
+    assertTokenStreamContents(ts,
+        new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" });
+  }
+
+  /** Test that words with format chars such as ZWJ/ZWNJ are kept (escaped here: they are invisible) */
+  public void testFormat() throws Exception {
+    TokenStream ts = new IndicTokenizer(Version.LUCENE_CURRENT,
+        new StringReader("शार्\u200Dमा शार्\u200Cमा"));
+    assertTokenStreamContents(ts, new String[] { "शार्\u200Dमा", "शार्\u200Cमा" });
+  }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java
------------------------------------------------------------------------------
svn:eol-style = native