You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/03/24 16:59:05 UTC
svn commit: r1304836 - in /lucene/dev/trunk: lucene/contrib/
modules/analysis/common/src/java/org/apache/lucene/analysis/ga/
modules/analysis/common/src/java/org/tartarus/snowball/ext/
modules/analysis/common/src/resources/org/apache/lucene/analysis/ga...
Author: rmuir
Date: Sat Mar 24 15:59:04 2012
New Revision: 1304836
URL: http://svn.apache.org/viewvc?rev=1304836&view=rev
Log:
LUCENE-3883: Irish Analyzer
Added:
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java (with props)
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.java (with props)
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/irish.sbl.txt
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/package.html (with props)
lucene/dev/trunk/modules/analysis/common/src/java/org/tartarus/snowball/ext/IrishStemmer.java (with props)
lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/ga/
lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/ga/stopwords.txt (with props)
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ga/
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishAnalyzer.java (with props)
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishLowerCaseFilter.java (with props)
lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/IrishLowerCaseFilterFactory.java (with props)
lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestIrishLowerCaseFilterFactory.java (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/contractions_ga.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/hyphenations_ga.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ga.txt (with props)
Modified:
lucene/dev/trunk/lucene/contrib/CHANGES.txt
lucene/dev/trunk/solr/build.xml
lucene/dev/trunk/solr/example/solr/conf/schema.xml
Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=1304836&r1=1304835&r2=1304836&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Sat Mar 24 15:59:04 2012
@@ -189,6 +189,8 @@ New Features
* LUCENE-3714: Add WFSTCompletionLookup suggester that supports more fine-grained
ranking for suggestions. (Mike McCandless, Dawid Weiss, Robert Muir)
+ * LUCENE-3883: Add Analyzer for Irish. (Jim Regan via Robert Muir)
+
API Changes
* LUCENE-3596: DirectoryTaxonomyWriter.openIndexWriter() now takes an
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java?rev=1304836&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java Sat Mar 24 15:59:04 2012
@@ -0,0 +1,151 @@
+package org.apache.lucene.analysis.ga;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.fr.ElisionFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.IrishStemmer;
+
+/**
+ * {@link Analyzer} for Irish.
+ */
+public final class IrishAnalyzer extends StopwordAnalyzerBase {
+  /** Terms that get marked as keywords before stemming (see {@link #createComponents}). */
+  private final CharArraySet stemExclusionSet;
+
+  /** File containing default Irish stopwords. */
+  public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+  /** Elided prefixes (as in d'..., m'..., b'...) that are handed to the {@link ElisionFilter}. */
+  private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
+      new CharArraySet(Version.LUCENE_CURRENT,
+          Arrays.asList(
+              "d", "m", "b"
+          ), true));
+
+  /**
+   * When StandardTokenizer splits t-athair into {t, athair}, we don't
+   * want to cause a position increment, otherwise there will be problems
+   * with phrase queries versus tAthair (which would not have a gap).
+   */
+  private static final CharArraySet HYPHENATIONS = CharArraySet.unmodifiableSet(
+      new CharArraySet(Version.LUCENE_CURRENT,
+          Arrays.asList(
+              "h", "n", "t"
+          ), true));
+
+  /**
+   * Returns an unmodifiable instance of the default stop words set.
+   * @return default stop words set.
+   */
+  public static CharArraySet getDefaultStopSet() {
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final CharArraySet DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadStopwordSet(false,
+            IrishAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR); keep the cause so a broken classpath can be diagnosed
+        throw new RuntimeException("Unable to load default stopword set", ex);
+      }
+    }
+  }
+
+  /**
+   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+   *
+   * @param matchVersion lucene compatibility version
+   */
+  public IrishAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words and no stem exclusions.
+   *
+   * @param matchVersion lucene compatibility version
+   * @param stopwords a stopword set
+   */
+  public IrishAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * stemming.
+   *
+   * @param matchVersion lucene compatibility version
+   * @param stopwords a stopword set
+   * @param stemExclusionSet a set of terms not to be stemmed; it is copied, so later
+   *        changes to the caller's set do not affect this analyzer
+   */
+  public IrishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
+  }
+
+  /**
+   * Creates a
+   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
+   * which tokenizes all the text in the provided {@link Reader}.
+   *
+   * @return A
+   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
+   *         built from a {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, a {@link StopFilter} that removes the
+   *         hyphenation fragments without position increments,
+   *         {@link ElisionFilter}, {@link IrishLowerCaseFilter}, a
+   *         {@link StopFilter} for the stopwords, {@link KeywordMarkerFilter}
+   *         if a stem exclusion set is provided, and {@link SnowballFilter}.
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(matchVersion, source);
+    // Remove the bare h/n/t fragments without leaving a position gap (see HYPHENATIONS).
+    final StopFilter hyphenationFilter = new StopFilter(matchVersion, result, HYPHENATIONS);
+    hyphenationFilter.setEnablePositionIncrements(false);
+    result = hyphenationFilter;
+    result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
+    result = new IrishLowerCaseFilter(result);
+    result = new StopFilter(matchVersion, result, stopwords);
+    if (!stemExclusionSet.isEmpty()) {
+      result = new KeywordMarkerFilter(result, stemExclusionSet);
+    }
+    result = new SnowballFilter(result, new IrishStemmer());
+    return new TokenStreamComponents(source, result);
+  }
+}
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.java?rev=1304836&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.java Sat Mar 24 15:59:04 2012
@@ -0,0 +1,85 @@
+package org.apache.lucene.analysis.ga;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * Normalises token text to lower case, handling t-prothesis
+ * and n-eclipsis (i.e., that 'nAthair' should become 'n-athair').
+ */
+public final class IrishLowerCaseFilter extends TokenFilter {
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  /**
+   * Create an IrishLowerCaseFilter that normalises Irish token text.
+   *
+   * @param in the token stream to normalise
+   */
+  public IrishLowerCaseFilter(TokenStream in) {
+    super(in);
+  }
+
+  /**
+   * Advances to the next token and lower-cases its term text in place. A term that
+   * starts with a lower-case 'n' or 't' directly followed by an upper-case vowel gets
+   * a '-' inserted after that first letter before the rest is lower-cased.
+   *
+   * @return {@code false} when the wrapped stream is exhausted, {@code true} otherwise
+   * @throws IOException if the wrapped stream fails
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+
+    char[] buffer = termAtt.buffer();
+    int length = termAtt.length();
+    int start = 0;
+
+    if (length > 1 && (buffer[0] == 'n' || buffer[0] == 't') && isUpperVowel(buffer[1])) {
+      // Make room for the hyphen: shift everything after the first letter right by one.
+      buffer = termAtt.resizeBuffer(length + 1);
+      System.arraycopy(buffer, 1, buffer, 2, length - 1);
+      buffer[1] = '-';
+      length++;
+      termAtt.setLength(length);
+      // The prefix letter is already lower case and the hyphen has no case.
+      start = 2;
+    }
+
+    // Work on code points rather than single chars so that characters stored as
+    // surrogate pairs are lower-cased as well.
+    for (int i = start; i < length;) {
+      final int lower = Character.toLowerCase(Character.codePointAt(buffer, i, length));
+      i += Character.toChars(lower, buffer, i);
+    }
+    return true;
+  }
+
+  /** Returns true if {@code v} is an upper-case vowel, plain or with an acute accent (fada). */
+  private static boolean isUpperVowel(int v) {
+    switch (v) {
+      case 'A':
+      case 'E':
+      case 'I':
+      case 'O':
+      case 'U':
+      // vowels with acute accent (fada)
+      case '\u00c1':
+      case '\u00c9':
+      case '\u00cd':
+      case '\u00d3':
+      case '\u00da':
+        return true;
+      default:
+        return false;
+    }
+  }
+}
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/irish.sbl.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/irish.sbl.txt?rev=1304836&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/irish.sbl.txt (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/irish.sbl.txt Sat Mar 24 15:59:04 2012
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ // These are the Snowball rules from LUCENE-3883, kept for reference or for
+ // code regeneration. We can remove this file once it's added to Snowball.
+
+routines (
+ R1 R2 RV
+ initial_morph
+ mark_regions
+ noun_sfx
+ deriv
+ verb_sfx
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+stringescapes {}
+
+/* Latin 1 */
+
+stringdef a' hex 'E1' // a-acute
+stringdef e' hex 'E9' // e-acute
+stringdef i' hex 'ED' // i-acute
+stringdef o' hex 'F3' // o-acute
+stringdef u' hex 'FA' // u-acute
+
+define v 'aeiou{a'}{e'}{i'}{o'}{u'}'
+
+// mark_regions: sets the three marks pV, p1 and p2 that the backward-mode
+// tests RV, R1 and R2 later compare against the cursor.
+define mark_regions as (
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+
+ // pV: position reached after going past the first vowel
+ do (
+ gopast v setmark pV
+ )
+ // p1: position after going past a vowel and then a non-vowel;
+ // p2: the same step repeated once more, continuing from p1.
+ // If the word runs out first, the remaining marks keep their default (limit).
+ do (
+ gopast v gopast non-v setmark p1
+ gopast v gopast non-v setmark p2
+ )
+)
+
+// initial_morph: rewrites the start of the word. One of the listed prefixes is
+// matched at the cursor and is either deleted or replaced by a single consonant.
+define initial_morph as (
+ [substring] among (
+ 'h-' 'n-' 't-' //nAthair -> n-athair, but alone are problematic
+ (delete)
+
+ // verbs
+ 'd{'}'
+ (delete)
+ 'd{'}fh'
+ (<- 'f')
+ // other contractions
+ 'm{'}' 'b{'}'
+ (delete)
+
+ 'sh'
+ (<- 's')
+
+ // two- and three-letter initial clusters reduced to one consonant
+ // (presumably the eclipsis forms; the list below is labelled Lenition)
+ 'mb'
+ (<- 'b')
+ 'gc'
+ (<- 'c')
+ 'nd'
+ (<- 'd')
+ 'bhf'
+ (<- 'f')
+ 'ng'
+ (<- 'g')
+ 'bp'
+ (<- 'p')
+ 'ts'
+ (<- 's')
+ 'dt'
+ (<- 't')
+
+ // Lenition
+ 'bh'
+ (<- 'b')
+ 'ch'
+ (<- 'c')
+ 'dh'
+ (<- 'd')
+ 'fh'
+ (<- 'f')
+ 'gh'
+ (<- 'g')
+ 'mh'
+ (<- 'm')
+ 'ph'
+ (<- 'p')
+ 'th'
+ (<- 't')
+ )
+)
+
+// Everything in this group runs in backward mode, i.e. it inspects the end of the word.
+backwardmode (
+
+ // Region tests: true when the cursor is at or after the corresponding mark
+ // set by mark_regions.
+ define RV as $pV <= cursor
+ define R1 as $p1 <= cursor
+ define R2 as $p2 <= cursor
+
+ // noun_sfx: deletes a matching noun ending, provided it lies in R1
+ // (first group) or in R2 (second group).
+ define noun_sfx as (
+ [substring] among (
+ 'amh' 'eamh' 'abh' 'eabh'
+ 'aibh' 'ibh' 'aimh' 'imh'
+ 'a{i'}ocht' '{i'}ocht' 'a{i'}ochta' '{i'}ochta'
+ (R1 delete)
+ 'ire' 'ir{i'}' 'aire' 'air{i'}'
+ (R2 delete)
+ )
+ )
+ // deriv: deletes a matching derivational ending when it lies in R2, or
+ // replaces one of the longer listed endings with a fixed stem (no region test).
+ define deriv as (
+ [substring] among (
+ 'acht' 'eacht' 'ach' 'each' 'eacht{u'}il' 'eachta' 'acht{u'}il' 'achta'
+ (R2 delete) //siopadóireacht -> siopadóir but not poblacht -> pobl
+ 'arcacht' 'arcachta{i'}' 'arcachta'
+ (<- 'arc') // monarcacht -> monarc
+ 'gineach' 'gineas' 'ginis'
+ (<- 'gin')
+ 'grafa{i'}och' 'grafa{i'}ocht' 'grafa{i'}ochta' 'grafa{i'}ochta{i'}'
+ (<- 'graf')
+ 'paite' 'patach' 'pataigh' 'patacha'
+ (<- 'paite')
+ '{o'}ideach' '{o'}ideacha' '{o'}idigh'
+ (<- '{o'}id')
+ )
+ )
+ // verb_sfx: deletes a matching verb ending, provided it lies in RV
+ // (first group) or in R1 (second group).
+ define verb_sfx as (
+ [substring] among (
+ 'imid' 'aimid' '{i'}mid' 'a{i'}mid'
+ 'faidh' 'fidh'
+ (RV delete)
+ 'ain'
+ 'eadh' 'adh'
+ '{a'}il'
+ 'tear' 'tar'
+ (R1 delete)
+ )
+ )
+)
+
+// stem: the external entry point. Each step is wrapped in 'do', so a step
+// that fails does not stop the ones after it.
+define stem as (
+ // forward pass: rewrite the word start, then compute pV/p1/p2 on the result
+ do initial_morph
+ do mark_regions
+ // backward pass: try the three suffix routines in this fixed order
+ backwards (
+ do noun_sfx
+ do deriv
+ do verb_sfx
+ )
+)
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/package.html?rev=1304836&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/package.html (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ga/package.html Sat Mar 24 15:59:04 2012
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analysis for Irish.
+</body>
+</html>
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/tartarus/snowball/ext/IrishStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/tartarus/snowball/ext/IrishStemmer.java?rev=1304836&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/tartarus/snowball/ext/IrishStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/tartarus/snowball/ext/IrishStemmer.java Sat Mar 24 15:59:04 2012
@@ -0,0 +1,587 @@
+// This file was generated automatically by the Snowball to Java compiler
+
+package org.tartarus.snowball.ext;
+
+import org.tartarus.snowball.Among;
+import org.tartarus.snowball.SnowballProgram;
+
+ /**
+ * This class was automatically generated by a Snowball to Java compiler
+ * It implements the stemming algorithm defined by a snowball script.
+ */
+
+public class IrishStemmer extends SnowballProgram {
+
+private static final long serialVersionUID = 1L;
+
+ // Instance handed to every Among entry below as its last constructor argument.
+ private final static IrishStemmer methodObject = new IrishStemmer ();
+
+ // Word-initial patterns looked up by r_initial_morph() through find_among().
+ // The third argument is the case number r_initial_morph() switches on.
+ // NOTE(review): the second argument looks like the index of a shorter entry of
+ // the same table that this entry extends (e.g. "bhf" -> 1 = "bh"), -1 if none;
+ // confirm against the Among class.
+ private final static Among a_0[] = {
+ new Among ( "b'", -1, 4, "", methodObject ),
+ new Among ( "bh", -1, 14, "", methodObject ),
+ new Among ( "bhf", 1, 9, "", methodObject ),
+ new Among ( "bp", -1, 11, "", methodObject ),
+ new Among ( "ch", -1, 15, "", methodObject ),
+ new Among ( "d'", -1, 2, "", methodObject ),
+ new Among ( "d'fh", 5, 3, "", methodObject ),
+ new Among ( "dh", -1, 16, "", methodObject ),
+ new Among ( "dt", -1, 13, "", methodObject ),
+ new Among ( "fh", -1, 17, "", methodObject ),
+ new Among ( "gc", -1, 7, "", methodObject ),
+ new Among ( "gh", -1, 18, "", methodObject ),
+ new Among ( "h-", -1, 1, "", methodObject ),
+ new Among ( "m'", -1, 4, "", methodObject ),
+ new Among ( "mb", -1, 6, "", methodObject ),
+ new Among ( "mh", -1, 19, "", methodObject ),
+ new Among ( "n-", -1, 1, "", methodObject ),
+ new Among ( "nd", -1, 8, "", methodObject ),
+ new Among ( "ng", -1, 10, "", methodObject ),
+ new Among ( "ph", -1, 20, "", methodObject ),
+ new Among ( "sh", -1, 5, "", methodObject ),
+ new Among ( "t-", -1, 1, "", methodObject ),
+ new Among ( "th", -1, 21, "", methodObject ),
+ new Among ( "ts", -1, 12, "", methodObject )
+ };
+
+ // Noun endings looked up by r_noun_sfx() through find_among_b().
+ // Result 1: deleted if in R1; result 2: deleted if in R2.
+ private final static Among a_1[] = {
+ new Among ( "\u00EDochta", -1, 1, "", methodObject ),
+ new Among ( "a\u00EDochta", 0, 1, "", methodObject ),
+ new Among ( "ire", -1, 2, "", methodObject ),
+ new Among ( "aire", 2, 2, "", methodObject ),
+ new Among ( "abh", -1, 1, "", methodObject ),
+ new Among ( "eabh", 4, 1, "", methodObject ),
+ new Among ( "ibh", -1, 1, "", methodObject ),
+ new Among ( "aibh", 6, 1, "", methodObject ),
+ new Among ( "amh", -1, 1, "", methodObject ),
+ new Among ( "eamh", 8, 1, "", methodObject ),
+ new Among ( "imh", -1, 1, "", methodObject ),
+ new Among ( "aimh", 10, 1, "", methodObject ),
+ new Among ( "\u00EDocht", -1, 1, "", methodObject ),
+ new Among ( "a\u00EDocht", 12, 1, "", methodObject ),
+ new Among ( "ir\u00ED", -1, 2, "", methodObject ),
+ new Among ( "air\u00ED", 14, 2, "", methodObject )
+ };
+
+ // Derivational endings looked up by r_deriv() through find_among_b().
+ // Result 1: deleted if in R2; results 2-6: replaced by "arc", "gin", "graf",
+ // "paite" and "\u00F3id" respectively.
+ private final static Among a_2[] = {
+ new Among ( "\u00F3ideacha", -1, 6, "", methodObject ),
+ new Among ( "patacha", -1, 5, "", methodObject ),
+ new Among ( "achta", -1, 1, "", methodObject ),
+ new Among ( "arcachta", 2, 2, "", methodObject ),
+ new Among ( "eachta", 2, 1, "", methodObject ),
+ new Among ( "grafa\u00EDochta", -1, 4, "", methodObject ),
+ new Among ( "paite", -1, 5, "", methodObject ),
+ new Among ( "ach", -1, 1, "", methodObject ),
+ new Among ( "each", 7, 1, "", methodObject ),
+ new Among ( "\u00F3ideach", 8, 6, "", methodObject ),
+ new Among ( "gineach", 8, 3, "", methodObject ),
+ new Among ( "patach", 7, 5, "", methodObject ),
+ new Among ( "grafa\u00EDoch", -1, 4, "", methodObject ),
+ new Among ( "pataigh", -1, 5, "", methodObject ),
+ new Among ( "\u00F3idigh", -1, 6, "", methodObject ),
+ new Among ( "acht\u00FAil", -1, 1, "", methodObject ),
+ new Among ( "eacht\u00FAil", 15, 1, "", methodObject ),
+ new Among ( "gineas", -1, 3, "", methodObject ),
+ new Among ( "ginis", -1, 3, "", methodObject ),
+ new Among ( "acht", -1, 1, "", methodObject ),
+ new Among ( "arcacht", 19, 2, "", methodObject ),
+ new Among ( "eacht", 19, 1, "", methodObject ),
+ new Among ( "grafa\u00EDocht", -1, 4, "", methodObject ),
+ new Among ( "arcachta\u00ED", -1, 2, "", methodObject ),
+ new Among ( "grafa\u00EDochta\u00ED", -1, 4, "", methodObject )
+ };
+
+ // Verb endings looked up by r_verb_sfx() through find_among_b().
+ // Result 1: deleted if in RV; result 2: deleted if in R1.
+ private final static Among a_3[] = {
+ new Among ( "imid", -1, 1, "", methodObject ),
+ new Among ( "aimid", 0, 1, "", methodObject ),
+ new Among ( "\u00EDmid", -1, 1, "", methodObject ),
+ new Among ( "a\u00EDmid", 2, 1, "", methodObject ),
+ new Among ( "adh", -1, 2, "", methodObject ),
+ new Among ( "eadh", 4, 2, "", methodObject ),
+ new Among ( "faidh", -1, 1, "", methodObject ),
+ new Among ( "fidh", -1, 1, "", methodObject ),
+ new Among ( "\u00E1il", -1, 2, "", methodObject ),
+ new Among ( "ain", -1, 2, "", methodObject ),
+ new Among ( "tear", -1, 2, "", methodObject ),
+ new Among ( "tar", -1, 2, "", methodObject )
+ };
+
+ // Grouping table passed to in_grouping()/out_grouping() together with the
+ // character range 97..250 ('a'..'\u00FA') in r_mark_regions().
+ // NOTE(review): presumably a bitmap of the vowels a e i o u and their
+ // acute-accented forms within that range; confirm against SnowballProgram.
+ private static final char g_v[] = {17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 17, 4, 2 };
+
+ // Region marks: written by r_mark_regions(), read by r_R2(), r_R1() and r_RV().
+ private int I_p2;
+ private int I_p1;
+ private int I_pV;
+
+ /**
+  * Copies the three region marks from {@code other} and then lets the
+  * superclass copy whatever state it keeps.
+  */
+ private void copy_from(IrishStemmer other) {
+   // The three marks are independent of each other.
+   I_pV = other.I_pV;
+   I_p1 = other.I_p1;
+   I_p2 = other.I_p2;
+   super.copy_from(other);
+ }
+
+ private boolean r_mark_regions() {
+ int v_1;
+ int v_3;
+ // (, line 28
+ I_pV = limit;
+ I_p1 = limit;
+ I_p2 = limit;
+ // do, line 34
+ v_1 = cursor;
+ lab0: do {
+ // (, line 34
+ // gopast, line 35
+ golab1: while(true)
+ {
+ lab2: do {
+ if (!(in_grouping(g_v, 97, 250)))
+ {
+ break lab2;
+ }
+ break golab1;
+ } while (false);
+ if (cursor >= limit)
+ {
+ break lab0;
+ }
+ cursor++;
+ }
+ // setmark pV, line 35
+ I_pV = cursor;
+ } while (false);
+ cursor = v_1;
+ // do, line 37
+ v_3 = cursor;
+ lab3: do {
+ // (, line 37
+ // gopast, line 38
+ golab4: while(true)
+ {
+ lab5: do {
+ if (!(in_grouping(g_v, 97, 250)))
+ {
+ break lab5;
+ }
+ break golab4;
+ } while (false);
+ if (cursor >= limit)
+ {
+ break lab3;
+ }
+ cursor++;
+ }
+ // gopast, line 38
+ golab6: while(true)
+ {
+ lab7: do {
+ if (!(out_grouping(g_v, 97, 250)))
+ {
+ break lab7;
+ }
+ break golab6;
+ } while (false);
+ if (cursor >= limit)
+ {
+ break lab3;
+ }
+ cursor++;
+ }
+ // setmark p1, line 38
+ I_p1 = cursor;
+ // gopast, line 39
+ golab8: while(true)
+ {
+ lab9: do {
+ if (!(in_grouping(g_v, 97, 250)))
+ {
+ break lab9;
+ }
+ break golab8;
+ } while (false);
+ if (cursor >= limit)
+ {
+ break lab3;
+ }
+ cursor++;
+ }
+ // gopast, line 39
+ golab10: while(true)
+ {
+ lab11: do {
+ if (!(out_grouping(g_v, 97, 250)))
+ {
+ break lab11;
+ }
+ break golab10;
+ } while (false);
+ if (cursor >= limit)
+ {
+ break lab3;
+ }
+ cursor++;
+ }
+ // setmark p2, line 39
+ I_p2 = cursor;
+ } while (false);
+ cursor = v_3;
+ return true;
+ }
+
+ private boolean r_initial_morph() {
+ int among_var;
+ // (, line 43
+ // [, line 44
+ bra = cursor;
+ // substring, line 44
+ among_var = find_among(a_0, 24);
+ if (among_var == 0)
+ {
+ return false;
+ }
+ // ], line 44
+ ket = cursor;
+ switch(among_var) {
+ case 0:
+ return false;
+ case 1:
+ // (, line 46
+ // delete, line 46
+ slice_del();
+ break;
+ case 2:
+ // (, line 50
+ // delete, line 50
+ slice_del();
+ break;
+ case 3:
+ // (, line 52
+ // <-, line 52
+ slice_from("f");
+ break;
+ case 4:
+ // (, line 55
+ // delete, line 55
+ slice_del();
+ break;
+ case 5:
+ // (, line 58
+ // <-, line 58
+ slice_from("s");
+ break;
+ case 6:
+ // (, line 61
+ // <-, line 61
+ slice_from("b");
+ break;
+ case 7:
+ // (, line 63
+ // <-, line 63
+ slice_from("c");
+ break;
+ case 8:
+ // (, line 65
+ // <-, line 65
+ slice_from("d");
+ break;
+ case 9:
+ // (, line 67
+ // <-, line 67
+ slice_from("f");
+ break;
+ case 10:
+ // (, line 69
+ // <-, line 69
+ slice_from("g");
+ break;
+ case 11:
+ // (, line 71
+ // <-, line 71
+ slice_from("p");
+ break;
+ case 12:
+ // (, line 73
+ // <-, line 73
+ slice_from("s");
+ break;
+ case 13:
+ // (, line 75
+ // <-, line 75
+ slice_from("t");
+ break;
+ case 14:
+ // (, line 79
+ // <-, line 79
+ slice_from("b");
+ break;
+ case 15:
+ // (, line 81
+ // <-, line 81
+ slice_from("c");
+ break;
+ case 16:
+ // (, line 83
+ // <-, line 83
+ slice_from("d");
+ break;
+ case 17:
+ // (, line 85
+ // <-, line 85
+ slice_from("f");
+ break;
+ case 18:
+ // (, line 87
+ // <-, line 87
+ slice_from("g");
+ break;
+ case 19:
+ // (, line 89
+ // <-, line 89
+ slice_from("m");
+ break;
+ case 20:
+ // (, line 91
+ // <-, line 91
+ slice_from("p");
+ break;
+ case 21:
+ // (, line 93
+ // <-, line 93
+ slice_from("t");
+ break;
+ }
+ return true;
+ }
+
+ /** Region test RV: true when the cursor is at or after the mark I_pV. */
+ private boolean r_RV() {
+   return I_pV <= cursor;
+ }
+
+ /** Region test R1: true when the cursor is at or after the mark I_p1. */
+ private boolean r_R1() {
+   return I_p1 <= cursor;
+ }
+
+ /** Region test R2: true when the cursor is at or after the mark I_p2. */
+ private boolean r_R2() {
+   return I_p2 <= cursor;
+ }
+
+ private boolean r_noun_sfx() {
+ int among_var;
+ // (, line 103
+ // [, line 104
+ ket = cursor;
+ // substring, line 104
+ among_var = find_among_b(a_1, 16);
+ if (among_var == 0)
+ {
+ return false;
+ }
+ // ], line 104
+ bra = cursor;
+ switch(among_var) {
+ case 0:
+ return false;
+ case 1:
+ // (, line 108
+ // call R1, line 108
+ if (!r_R1())
+ {
+ return false;
+ }
+ // delete, line 108
+ slice_del();
+ break;
+ case 2:
+ // (, line 110
+ // call R2, line 110
+ if (!r_R2())
+ {
+ return false;
+ }
+ // delete, line 110
+ slice_del();
+ break;
+ }
+ return true;
+ }
+
+ private boolean r_deriv() {
+ int among_var;
+ // (, line 113
+ // [, line 114
+ ket = cursor;
+ // substring, line 114
+ among_var = find_among_b(a_2, 25);
+ if (among_var == 0)
+ {
+ return false;
+ }
+ // ], line 114
+ bra = cursor;
+ switch(among_var) {
+ case 0:
+ return false;
+ case 1:
+ // (, line 116
+ // call R2, line 116
+ if (!r_R2())
+ {
+ return false;
+ }
+ // delete, line 116
+ slice_del();
+ break;
+ case 2:
+ // (, line 118
+ // <-, line 118
+ slice_from("arc");
+ break;
+ case 3:
+ // (, line 120
+ // <-, line 120
+ slice_from("gin");
+ break;
+ case 4:
+ // (, line 122
+ // <-, line 122
+ slice_from("graf");
+ break;
+ case 5:
+ // (, line 124
+ // <-, line 124
+ slice_from("paite");
+ break;
+ case 6:
+ // (, line 126
+ // <-, line 126
+ slice_from("\u00F3id");
+ break;
+ }
+ return true;
+ }
+
+ private boolean r_verb_sfx() {
+ int among_var;
+ // (, line 129
+ // [, line 130
+ ket = cursor;
+ // substring, line 130
+ among_var = find_among_b(a_3, 12);
+ if (among_var == 0)
+ {
+ return false;
+ }
+ // ], line 130
+ bra = cursor;
+ switch(among_var) {
+ case 0:
+ return false;
+ case 1:
+ // (, line 133
+ // call RV, line 133
+ if (!r_RV())
+ {
+ return false;
+ }
+ // delete, line 133
+ slice_del();
+ break;
+ case 2:
+ // (, line 138
+ // call R1, line 138
+ if (!r_R1())
+ {
+ return false;
+ }
+ // delete, line 138
+ slice_del();
+ break;
+ }
+ return true;
+ }
+
+ /**
+  * Runs the stemmer: initial_morph and mark_regions going forwards, then
+  * noun_sfx, deriv and verb_sfx going backwards. A step that fails does not
+  * prevent the following steps from running.
+  *
+  * @return always {@code true}
+  */
+ public boolean stem() {
+   // Both forward steps start from the cursor position seen on entry.
+   final int start = cursor;
+   r_initial_morph();
+   cursor = start;
+   r_mark_regions();
+   cursor = start;
+
+   // Switch to backward processing.
+   limit_backward = cursor;
+   cursor = limit;
+
+   // Each suffix routine starts with the cursor at limit, whatever the
+   // previous routine did.
+   r_noun_sfx();
+   cursor = limit;
+   r_deriv();
+   cursor = limit;
+   r_verb_sfx();
+
+   cursor = limit_backward;
+   return true;
+ }
+
+ /**
+  * Treats every IrishStemmer as equal to every other one; no instance state
+  * is compared.
+  *
+  * @param o the object to compare with
+  * @return {@code true} if {@code o} is an IrishStemmer
+  */
+ @Override
+ public boolean equals( Object o ) {
+   return o instanceof IrishStemmer;
+ }
+
+ /**
+  * Returns the hash of the class name, so all instances share one hash code.
+  *
+  * @return the hash code of this class's name
+  */
+ @Override
+ public int hashCode() {
+   return IrishStemmer.class.getName().hashCode();
+ }
+
+
+
+}
+
Added: lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/ga/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/ga/stopwords.txt?rev=1304836&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/ga/stopwords.txt (added)
+++ lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/ga/stopwords.txt Sat Mar 24 15:59:04 2012
@@ -0,0 +1,110 @@
+
+a
+ach
+ag
+agus
+an
+aon
+ar
+arna
+as
+b'
+ba
+beirt
+bhúr
+caoga
+ceathair
+ceathrar
+chomh
+chtó
+chuig
+chun
+cois
+céad
+cúig
+cúigear
+d'
+daichead
+dar
+de
+deich
+deichniúr
+den
+dhá
+do
+don
+dtí
+dá
+dár
+dó
+faoi
+faoin
+faoina
+faoinár
+fara
+fiche
+gach
+gan
+go
+gur
+haon
+hocht
+i
+iad
+idir
+in
+ina
+ins
+inár
+is
+le
+leis
+lena
+lenár
+m'
+mar
+mo
+mé
+na
+nach
+naoi
+naonúr
+ná
+ní
+níor
+nó
+nócha
+ocht
+ochtar
+os
+roimh
+sa
+seacht
+seachtar
+seachtó
+seasca
+seisear
+siad
+sibh
+sinn
+sna
+sé
+sí
+tar
+thar
+thú
+triúr
+trí
+trína
+trínár
+tríocha
+tú
+um
+ár
+é
+éis
+í
+ó
+ón
+óna
+ónár
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishAnalyzer.java?rev=1304836&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishAnalyzer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishAnalyzer.java Sat Mar 24 15:59:04 2012
@@ -0,0 +1,71 @@
+package org.apache.lucene.analysis.ga;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.util.CharArraySet;
+
+public class TestIrishAnalyzer extends BaseTokenStreamTestCase {
+  /**
+   * Smoke test: constructing the analyzer with defaults must succeed.
+   * This test fails with NPE when the stopwords file is missing in classpath.
+   */
+  public void testResourcesAvailable() {
+    new IrishAnalyzer(TEST_VERSION_CURRENT);
+  }
+
+  /** Checks stemming of single terms and removal of a stopword. */
+  public void testBasics() throws IOException {
+    Analyzer a = new IrishAnalyzer(TEST_VERSION_CURRENT);
+    // stemming (literals must be real UTF-8 Irish text, not mis-decoded bytes)
+    checkOneTermReuse(a, "siopadóireacht", "siopadóir");
+    checkOneTermReuse(a, "síceapatacha", "síceapaite");
+    // stopword: "le" must yield no tokens at all
+    assertAnalyzesTo(a, "le", new String[] { });
+  }
+
+  /** Checks that the elided prefixes b' and m' are removed (use of the elision filter). */
+  public void testContractions() throws IOException {
+    Analyzer a = new IrishAnalyzer(TEST_VERSION_CURRENT);
+    assertAnalyzesTo(a, "b'fhearr m'athair",
+        new String[] { "fearr", "athair" });
+  }
+
+  /** Checks that a term in the exclusion set is left unstemmed while other terms are still stemmed. */
+  public void testExclude() throws IOException {
+    CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("feirmeoireacht"), false);
+    Analyzer a = new IrishAnalyzer(TEST_VERSION_CURRENT,
+        IrishAnalyzer.getDefaultStopSet(), exclusionSet);
+    // excluded term comes back unchanged
+    checkOneTermReuse(a, "feirmeoireacht", "feirmeoireacht");
+    // non-excluded term is still stemmed
+    checkOneTermReuse(a, "siopadóireacht", "siopadóir");
+  }
+
+  /** Checks special hyphen handling: "n-athair" must yield the single token "athair". */
+  public void testHyphens() throws IOException {
+    Analyzer a = new IrishAnalyzer(TEST_VERSION_CURRENT);
+    assertAnalyzesTo(a, "n-athair",
+        new String[] { "athair" },
+        new int[] { 1 });
+  }
+
+  /** Blasts some random strings through the analyzer. */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new IrishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
+}
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishLowerCaseFilter.java?rev=1304836&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishLowerCaseFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishLowerCaseFilter.java Sat Mar 24 15:59:04 2012
@@ -0,0 +1,41 @@
+package org.apache.lucene.analysis.ga;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Verifies the terms emitted by {@link IrishLowerCaseFilter}.
+ */
+public class TestIrishLowerCaseFilter extends BaseTokenStreamTestCase {
+
+  /**
+   * Runs three whitespace-separated, mixed-case tokens through the filter
+   * and checks the exact terms that come out.
+   */
+  public void testIrishLowerCaseFilter() throws Exception {
+    final StringReader input = new StringReader("nAthair tUISCE hARD");
+    final TokenStream tokens = new MockTokenizer(input, MockTokenizer.WHITESPACE, false);
+    final IrishLowerCaseFilter lowercased = new IrishLowerCaseFilter(tokens);
+    final String[] expected = { "n-athair", "t-uisce", "hard" };
+    assertTokenStreamContents(lowercased, expected);
+  }
+}
Modified: lucene/dev/trunk/solr/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/build.xml?rev=1304836&r1=1304835&r2=1304836&view=diff
==============================================================================
--- lucene/dev/trunk/solr/build.xml (original)
+++ lucene/dev/trunk/solr/build.xml Sat Mar 24 15:59:04 2012
@@ -672,6 +672,9 @@
<!-- french -->
<copy verbose="true" file="${analysis-common.res.dir}/snowball/french_stop.txt"
tofile="${analysis.conf.dest}/stopwords_fr.txt"/>
+ <!-- irish -->
+ <copy verbose="true" file="${analysis-common.res.dir}/ga/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_ga.txt"/>
<!-- galician -->
<copy verbose="true" file="${analysis-common.res.dir}/gl/stopwords.txt"
tofile="${analysis.conf.dest}/stopwords_gl.txt"/>
Added: lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/IrishLowerCaseFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/IrishLowerCaseFilterFactory.java?rev=1304836&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/IrishLowerCaseFilterFactory.java (added)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/IrishLowerCaseFilterFactory.java Sat Mar 24 15:59:04 2012
@@ -0,0 +1,40 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ga.IrishLowerCaseFilter;
+
+/**
+ * Factory for {@link IrishLowerCaseFilter}.
+ * <pre class="prettyprint" >
+ * &lt;fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.IrishLowerCaseFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ *
+ */
+public class IrishLowerCaseFilterFactory extends BaseTokenFilterFactory {
+
+  /**
+   * Wraps the given stream in a new {@link IrishLowerCaseFilter}.
+   *
+   * @param input the token stream to wrap
+   * @return a new filter reading from {@code input}
+   */
+  @Override
+  public TokenStream create(TokenStream input) {
+    final TokenStream wrapped = new IrishLowerCaseFilter(input);
+    return wrapped;
+  }
+}
Added: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestIrishLowerCaseFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestIrishLowerCaseFilterFactory.java?rev=1304836&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestIrishLowerCaseFilterFactory.java (added)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestIrishLowerCaseFilterFactory.java Sat Mar 24 15:59:04 2012
@@ -0,0 +1,36 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Sanity check that {@link IrishLowerCaseFilterFactory} creates a working filter.
+ */
+public class TestIrishLowerCaseFilterFactory extends BaseTokenTestCase {
+  /**
+   * Builds a filter through the factory over whitespace-tokenized, mixed-case
+   * input and checks the exact terms it emits.
+   */
+  public void testCasing() throws Exception {
+    final Reader input = new StringReader("nAthair tUISCE hARD");
+    final IrishLowerCaseFilterFactory factory = new IrishLowerCaseFilterFactory();
+    final MockTokenizer tokenizer = new MockTokenizer(input, MockTokenizer.WHITESPACE, false);
+    final TokenStream filtered = factory.create(tokenizer);
+    final String[] expected = { "n-athair", "t-uisce", "hard" };
+    assertTokenStreamContents(filtered, expected);
+  }
+}
Added: lucene/dev/trunk/solr/example/solr/conf/lang/contractions_ga.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/lang/contractions_ga.txt?rev=1304836&view=auto
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/lang/contractions_ga.txt (added)
+++ lucene/dev/trunk/solr/example/solr/conf/lang/contractions_ga.txt Sat Mar 24 15:59:04 2012
@@ -0,0 +1,5 @@
+# Set of Irish contractions for ElisionFilter
+# TODO: load this as a resource from the analyzer and sync it in build.xml
+d
+m
+b
Added: lucene/dev/trunk/solr/example/solr/conf/lang/hyphenations_ga.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/lang/hyphenations_ga.txt?rev=1304836&view=auto
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/lang/hyphenations_ga.txt (added)
+++ lucene/dev/trunk/solr/example/solr/conf/lang/hyphenations_ga.txt Sat Mar 24 15:59:04 2012
@@ -0,0 +1,5 @@
+# Set of Irish hyphenations for StopFilter
+# TODO: load this as a resource from the analyzer and sync it in build.xml
+h
+n
+t
Added: lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ga.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ga.txt?rev=1304836&view=auto
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ga.txt (added)
+++ lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ga.txt Sat Mar 24 15:59:04 2012
@@ -0,0 +1,110 @@
+
+a
+ach
+ag
+agus
+an
+aon
+ar
+arna
+as
+b'
+ba
+beirt
+bhúr
+caoga
+ceathair
+ceathrar
+chomh
+chtó
+chuig
+chun
+cois
+céad
+cúig
+cúigear
+d'
+daichead
+dar
+de
+deich
+deichniúr
+den
+dhá
+do
+don
+dtÃ
+dá
+dár
+dó
+faoi
+faoin
+faoina
+faoinár
+fara
+fiche
+gach
+gan
+go
+gur
+haon
+hocht
+i
+iad
+idir
+in
+ina
+ins
+inár
+is
+le
+leis
+lena
+lenár
+m'
+mar
+mo
+mé
+na
+nach
+naoi
+naonúr
+ná
+nÃ
+nÃor
+nó
+nócha
+ocht
+ochtar
+os
+roimh
+sa
+seacht
+seachtar
+seachtó
+seasca
+seisear
+siad
+sibh
+sinn
+sna
+sé
+sÃ
+tar
+thar
+thú
+triúr
+trÃ
+trÃna
+trÃnár
+trÃocha
+tú
+um
+ár
+é
+éis
+Ã
+ó
+ón
+óna
+ónár
Modified: lucene/dev/trunk/solr/example/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?rev=1304836&r1=1304835&r2=1304836&view=diff
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/schema.xml (original)
+++ lucene/dev/trunk/solr/example/solr/conf/schema.xml Sat Mar 24 15:59:04 2012
@@ -617,6 +617,20 @@
</analyzer>
</fieldType>
+ <!-- Irish -->
+ <fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <!-- removes d', etc -->
+ <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ga.txt"/>
+ <!-- removes n-, etc. position increments is intentionally false! -->
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt" enablePositionIncrements="false"/>
+ <filter class="solr.IrishLowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ga.txt" enablePositionIncrements="true"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="Irish"/>
+ </analyzer>
+ </fieldType>
+
<!-- Galician -->
<fieldType name="text_gl" class="solr.TextField" positionIncrementGap="100">
<analyzer>