Posted to commits@lucene.apache.org by st...@apache.org on 2011/12/09 16:54:40 UTC

svn commit: r1212486 - in /lucene/dev/trunk/solr/contrib/clustering/src: java/org/apache/solr/handler/clustering/carrot2/ test/org/apache/solr/handler/clustering/carrot2/

Author: stanislaw
Date: Fri Dec  9 15:54:39 2011
New Revision: 1212486

URL: http://svn.apache.org/viewvc?rev=1212486&view=rev
Log:
Tabs to spaces.
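
For reference, a whitespace normalization like this one is usually scripted rather than done by hand. Below is a minimal Java sketch of such a conversion; the TabsToSpaces class is a hypothetical helper, not part of this commit, and it assumes the Lucene/Solr convention of two-space indents.

    // Hypothetical helper, not part of this commit: rewrites leading tabs
    // as two-space indents, mirroring what this change does across files.
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.Paths;
    import java.util.ArrayList;
    import java.util.List;

    public class TabsToSpaces {
      public static void main(String[] args) throws IOException {
        Path file = Paths.get(args[0]);
        List<String> out = new ArrayList<String>();
        for (String line : Files.readAllLines(file, StandardCharsets.UTF_8)) {
          // Only tabs in the leading whitespace are replaced; tabs embedded
          // in string literals or comments are left untouched.
          int i = 0;
          while (i < line.length()
              && (line.charAt(i) == '\t' || line.charAt(i) == ' ')) {
            i++;
          }
          out.add(line.substring(0, i).replace("\t", "  ") + line.substring(i));
        }
        Files.write(file, out, StandardCharsets.UTF_8);
      }
    }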

Modified:
    lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
    lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java
    lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java
    lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java
    lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
    lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java

Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=1212486&r1=1212485&r2=1212486&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Fri Dec  9 15:54:39 2011
@@ -77,13 +77,13 @@ import com.google.common.io.Closeables;
  * @see "http://project.carrot2.org"
  */
 public class CarrotClusteringEngine extends SearchClusteringEngine {
-	private transient static Logger log = LoggerFactory
+  private transient static Logger log = LoggerFactory
           .getLogger(CarrotClusteringEngine.class);
 
-	/**
-	 * The subdirectory in Solr config dir to read customized Carrot2 resources from.
-	 */
-	private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
+  /**
+   * The subdirectory in Solr config dir to read customized Carrot2 resources from.
+   */
+  private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
 
   /**
    * Name of Carrot2 document's field containing Solr document's identifier.
@@ -227,8 +227,8 @@ public class CarrotClusteringEngine exte
     }
   }
 
-	@Override
-	@SuppressWarnings({ "unchecked", "rawtypes" })
+  @Override
+  @SuppressWarnings({ "unchecked", "rawtypes" })
   public String init(NamedList config, final SolrCore core) {
     String result = super.init(config, core);
     final SolrParams initParams = SolrParams.toSolrParams(config);
@@ -243,13 +243,13 @@ public class CarrotClusteringEngine exte
     // Additionally, we set a custom lexical resource factory for Carrot2 that
     // will use both Carrot2 default stop words as well as stop words from
     // the StopFilter defined on the field.
-		BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
-				.stemmerFactory(LuceneCarrot2StemmerFactory.class)
-				.tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
-				.lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
+    BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
+        .stemmerFactory(LuceneCarrot2StemmerFactory.class)
+        .tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
+        .lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
 
-		// Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
-		initAttributes.put("solrIndexSchema", core.getSchema());
+    // Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
+    initAttributes.put("solrIndexSchema", core.getSchema());
 
     // Customize Carrot2's resource lookup to first look for resources
     // using Solr's resource loader. If that fails, try loading from the classpath.
@@ -283,15 +283,15 @@ public class CarrotClusteringEngine exte
     HashSet<String> fields = Sets.newHashSet(getFieldsForClustering(sreq));
     fields.add(idFieldName);
     fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
-		return fields;
+    return fields;
   }
 
-	/**
-	 * Returns the names of fields that will be delivering the actual
-	 * content for clustering. Currently, there are two such fields: document
-	 * title and document content.
-	 */
-	private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
+  /**
+   * Returns the names of fields that will be delivering the actual
+   * content for clustering. Currently, there are two such fields: document
+   * title and document content.
+   */
+  private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
     SolrParams solrParams = sreq.getParams();
 
     String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
@@ -301,7 +301,7 @@ public class CarrotClusteringEngine exte
               + " must not be blank.");
     }
     return Sets.newHashSet(titleField, snippetField);
-	}
+  }
 
   /**
    * Prepares Carrot2 documents for clustering.
@@ -362,7 +362,7 @@ public class CarrotClusteringEngine exte
         if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
           //should only be one document with one field
           @SuppressWarnings("unchecked")
-					NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
+          NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
           String [] highlt = tmp.get(snippetField);
           if (highlt != null && highlt.length == 1) {
             snippet = highlt[0];

Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java?rev=1212486&r1=1212485&r2=1212486&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java Fri Dec  9 15:54:39 2011
@@ -50,192 +50,192 @@ import org.tartarus.snowball.ext.Turkish
  * in this class.
  */
 public class LuceneCarrot2StemmerFactory implements IStemmerFactory {
-	final static Logger logger = org.slf4j.LoggerFactory
-			.getLogger(LuceneCarrot2StemmerFactory.class);
+  final static Logger logger = org.slf4j.LoggerFactory
+      .getLogger(LuceneCarrot2StemmerFactory.class);
 
-	@Override
-	public IStemmer getStemmer(LanguageCode language) {
-		switch (language) {
-		case ARABIC:
-			return ArabicStemmerFactory.createStemmer();
-
-		case CHINESE_SIMPLIFIED:
-			return IdentityStemmer.INSTANCE;
-
-		default:
-			/*
-			 * For other languages, try to use snowball's stemming.
-			 */
-			return SnowballStemmerFactory.createStemmer(language);
-		}
-	}
-
-	/**
-	 * Factory of {@link IStemmer} implementations from the <code>snowball</code>
-	 * project.
-	 */
-	private final static class SnowballStemmerFactory {
-		/**
-		 * Static hard mapping from language codes to stemmer classes in Snowball.
-		 * This mapping is not dynamic because we want to keep the possibility to
-		 * obfuscate these classes.
-		 */
-		private static HashMap<LanguageCode, Class<? extends SnowballProgram>> snowballStemmerClasses;
-		static {
-			snowballStemmerClasses = new HashMap<LanguageCode, Class<? extends SnowballProgram>>();
-			snowballStemmerClasses.put(LanguageCode.DANISH, DanishStemmer.class);
-			snowballStemmerClasses.put(LanguageCode.DUTCH, DutchStemmer.class);
-			snowballStemmerClasses.put(LanguageCode.ENGLISH, EnglishStemmer.class);
-			snowballStemmerClasses.put(LanguageCode.FINNISH, FinnishStemmer.class);
-			snowballStemmerClasses.put(LanguageCode.FRENCH, FrenchStemmer.class);
-			snowballStemmerClasses.put(LanguageCode.GERMAN, GermanStemmer.class);
-			snowballStemmerClasses
-					.put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
-			snowballStemmerClasses.put(LanguageCode.ITALIAN, ItalianStemmer.class);
-			snowballStemmerClasses
-					.put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
-			snowballStemmerClasses.put(LanguageCode.PORTUGUESE,
-					PortugueseStemmer.class);
-			snowballStemmerClasses.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
-			snowballStemmerClasses.put(LanguageCode.RUSSIAN, RussianStemmer.class);
-			snowballStemmerClasses.put(LanguageCode.SPANISH, SpanishStemmer.class);
-			snowballStemmerClasses.put(LanguageCode.SWEDISH, SwedishStemmer.class);
-			snowballStemmerClasses.put(LanguageCode.TURKISH, TurkishStemmer.class);
-		}
-
-		/**
-		 * An adapter converting Snowball programs into {@link IStemmer} interface.
-		 */
-		private static class SnowballStemmerAdapter implements IStemmer {
-			private final SnowballProgram snowballStemmer;
-
-			public SnowballStemmerAdapter(SnowballProgram snowballStemmer) {
-				this.snowballStemmer = snowballStemmer;
-			}
-
-			public CharSequence stem(CharSequence word) {
-				snowballStemmer.setCurrent(word.toString());
-				if (snowballStemmer.stem()) {
-					return snowballStemmer.getCurrent();
-				} else {
-					return null;
-				}
-			}
-		}
-
-		/**
-		 * Create and return an {@link IStemmer} adapter for a
-		 * {@link SnowballProgram} for a given language code. An identity stemmer is
-		 * returned for unknown languages.
-		 */
-		public static IStemmer createStemmer(LanguageCode language) {
-			final Class<? extends SnowballProgram> stemmerClazz = snowballStemmerClasses
-					.get(language);
-
-			if (stemmerClazz == null) {
-				logger.warn("No Snowball stemmer class for: " + language.name()
-						+ ". Quality of clustering may be degraded.");
-				return IdentityStemmer.INSTANCE;
-			}
-
-			try {
-				return new SnowballStemmerAdapter(stemmerClazz.newInstance());
-			} catch (Exception e) {
-				logger.warn("Could not instantiate snowball stemmer"
-						+ " for language: " + language.name()
-						+ ". Quality of clustering may be degraded.", e);
-
-				return IdentityStemmer.INSTANCE;
-			}
-		}
-	}
-
-	/**
-	 * Factory of {@link IStemmer} implementations for the
-	 * {@link LanguageCode#ARABIC} language. Requires <code>lucene-contrib</code>
-	 * to be present in classpath, otherwise an empty (identity) stemmer is
-	 * returned.
-	 */
-	private static class ArabicStemmerFactory {
-		static {
-			try {
-				ReflectionUtils.classForName(ArabicStemmer.class.getName(), false);
-				ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false);
-			} catch (ClassNotFoundException e) {
-				logger
-						.warn(
-								"Could not instantiate Lucene stemmer for Arabic, clustering quality "
-										+ "of Arabic content may be degraded. For best quality clusters, "
-										+ "make sure Lucene's Arabic analyzer JAR is in the classpath",
-								e);
-			}
-		}
-
-		/**
-		 * Adapter to lucene-contrib Arabic analyzers.
-		 */
-		private static class LuceneStemmerAdapter implements IStemmer {
-			private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
-			private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;
-
-			private char[] buffer = new char[0];
-
-			private LuceneStemmerAdapter() throws Exception {
-				delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
-				normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
-			}
-
-			public CharSequence stem(CharSequence word) {
-				if (word.length() > buffer.length) {
-					buffer = new char[word.length()];
-				}
-
-				for (int i = 0; i < word.length(); i++) {
-					buffer[i] = word.charAt(i);
-				}
-
-				int newLen = normalizer.normalize(buffer, word.length());
-				newLen = delegate.stem(buffer, newLen);
-
-				if (newLen != word.length() || !equals(buffer, newLen, word)) {
-					return CharBuffer.wrap(buffer, 0, newLen);
-				}
-
-				// Same-same.
-				return null;
-			}
-
-			private boolean equals(char[] buffer, int len, CharSequence word) {
-				assert len == word.length();
-
-				for (int i = 0; i < len; i++) {
-					if (buffer[i] != word.charAt(i))
-						return false;
-				}
-
-				return true;
-			}
-		}
-
-		public static IStemmer createStemmer() {
-			try {
-				return new LuceneStemmerAdapter();
-			} catch (Throwable e) {
-				return IdentityStemmer.INSTANCE;
-			}
-		}
-	}
-
-	/**
-	 * An implementation of {@link IStemmer} that always returns <code>null</code>
-	 * which means no stemming.
-	 */
-	private static class IdentityStemmer implements IStemmer {
-		private final static IdentityStemmer INSTANCE = new IdentityStemmer();
-
-		@Override
-		public CharSequence stem(CharSequence word) {
-			return null;
-		}
-	}
+  @Override
+  public IStemmer getStemmer(LanguageCode language) {
+    switch (language) {
+    case ARABIC:
+      return ArabicStemmerFactory.createStemmer();
+
+    case CHINESE_SIMPLIFIED:
+      return IdentityStemmer.INSTANCE;
+
+    default:
+      /*
+       * For other languages, try to use snowball's stemming.
+       */
+      return SnowballStemmerFactory.createStemmer(language);
+    }
+  }
+
+  /**
+   * Factory of {@link IStemmer} implementations from the <code>snowball</code>
+   * project.
+   */
+  private final static class SnowballStemmerFactory {
+    /**
+     * Static hard mapping from language codes to stemmer classes in Snowball.
+     * This mapping is not dynamic because we want to keep the possibility to
+     * obfuscate these classes.
+     */
+    private static HashMap<LanguageCode, Class<? extends SnowballProgram>> snowballStemmerClasses;
+    static {
+      snowballStemmerClasses = new HashMap<LanguageCode, Class<? extends SnowballProgram>>();
+      snowballStemmerClasses.put(LanguageCode.DANISH, DanishStemmer.class);
+      snowballStemmerClasses.put(LanguageCode.DUTCH, DutchStemmer.class);
+      snowballStemmerClasses.put(LanguageCode.ENGLISH, EnglishStemmer.class);
+      snowballStemmerClasses.put(LanguageCode.FINNISH, FinnishStemmer.class);
+      snowballStemmerClasses.put(LanguageCode.FRENCH, FrenchStemmer.class);
+      snowballStemmerClasses.put(LanguageCode.GERMAN, GermanStemmer.class);
+      snowballStemmerClasses
+          .put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
+      snowballStemmerClasses.put(LanguageCode.ITALIAN, ItalianStemmer.class);
+      snowballStemmerClasses
+          .put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
+      snowballStemmerClasses.put(LanguageCode.PORTUGUESE,
+          PortugueseStemmer.class);
+      snowballStemmerClasses.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
+      snowballStemmerClasses.put(LanguageCode.RUSSIAN, RussianStemmer.class);
+      snowballStemmerClasses.put(LanguageCode.SPANISH, SpanishStemmer.class);
+      snowballStemmerClasses.put(LanguageCode.SWEDISH, SwedishStemmer.class);
+      snowballStemmerClasses.put(LanguageCode.TURKISH, TurkishStemmer.class);
+    }
+
+    /**
+     * An adapter converting Snowball programs into {@link IStemmer} interface.
+     */
+    private static class SnowballStemmerAdapter implements IStemmer {
+      private final SnowballProgram snowballStemmer;
+
+      public SnowballStemmerAdapter(SnowballProgram snowballStemmer) {
+        this.snowballStemmer = snowballStemmer;
+      }
+
+      public CharSequence stem(CharSequence word) {
+        snowballStemmer.setCurrent(word.toString());
+        if (snowballStemmer.stem()) {
+          return snowballStemmer.getCurrent();
+        } else {
+          return null;
+        }
+      }
+    }
+
+    /**
+     * Create and return an {@link IStemmer} adapter for a
+     * {@link SnowballProgram} for a given language code. An identity stemmer is
+     * returned for unknown languages.
+     */
+    public static IStemmer createStemmer(LanguageCode language) {
+      final Class<? extends SnowballProgram> stemmerClazz = snowballStemmerClasses
+          .get(language);
+
+      if (stemmerClazz == null) {
+        logger.warn("No Snowball stemmer class for: " + language.name()
+            + ". Quality of clustering may be degraded.");
+        return IdentityStemmer.INSTANCE;
+      }
+
+      try {
+        return new SnowballStemmerAdapter(stemmerClazz.newInstance());
+      } catch (Exception e) {
+        logger.warn("Could not instantiate snowball stemmer"
+            + " for language: " + language.name()
+            + ". Quality of clustering may be degraded.", e);
+
+        return IdentityStemmer.INSTANCE;
+      }
+    }
+  }
+
+  /**
+   * Factory of {@link IStemmer} implementations for the
+   * {@link LanguageCode#ARABIC} language. Requires <code>lucene-contrib</code>
+   * to be present in classpath, otherwise an empty (identity) stemmer is
+   * returned.
+   */
+  private static class ArabicStemmerFactory {
+    static {
+      try {
+        ReflectionUtils.classForName(ArabicStemmer.class.getName(), false);
+        ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false);
+      } catch (ClassNotFoundException e) {
+        logger
+            .warn(
+                "Could not instantiate Lucene stemmer for Arabic, clustering quality "
+                    + "of Arabic content may be degraded. For best quality clusters, "
+                    + "make sure Lucene's Arabic analyzer JAR is in the classpath",
+                e);
+      }
+    }
+
+    /**
+     * Adapter to lucene-contrib Arabic analyzers.
+     */
+    private static class LuceneStemmerAdapter implements IStemmer {
+      private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
+      private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;
+
+      private char[] buffer = new char[0];
+
+      private LuceneStemmerAdapter() throws Exception {
+        delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
+        normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
+      }
+
+      public CharSequence stem(CharSequence word) {
+        if (word.length() > buffer.length) {
+          buffer = new char[word.length()];
+        }
+
+        for (int i = 0; i < word.length(); i++) {
+          buffer[i] = word.charAt(i);
+        }
+
+        int newLen = normalizer.normalize(buffer, word.length());
+        newLen = delegate.stem(buffer, newLen);
+
+        if (newLen != word.length() || !equals(buffer, newLen, word)) {
+          return CharBuffer.wrap(buffer, 0, newLen);
+        }
+
+        // Same-same.
+        return null;
+      }
+
+      private boolean equals(char[] buffer, int len, CharSequence word) {
+        assert len == word.length();
+
+        for (int i = 0; i < len; i++) {
+          if (buffer[i] != word.charAt(i))
+            return false;
+        }
+
+        return true;
+      }
+    }
+
+    public static IStemmer createStemmer() {
+      try {
+        return new LuceneStemmerAdapter();
+      } catch (Throwable e) {
+        return IdentityStemmer.INSTANCE;
+      }
+    }
+  }
+
+  /**
+   * An implementation of {@link IStemmer} that always returns <code>null</code>
+   * which means no stemming.
+   */
+  private static class IdentityStemmer implements IStemmer {
+    private final static IdentityStemmer INSTANCE = new IdentityStemmer();
+
+    @Override
+    public CharSequence stem(CharSequence word) {
+      return null;
+    }
+  }
 }
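
For context, the SnowballStemmerAdapter in the file above wraps the stock Snowball API, which a caller would otherwise drive directly as below. This is a minimal sketch assuming the org.tartarus.snowball classes bundled with Lucene are on the classpath; it is not part of this commit.

    import org.tartarus.snowball.ext.EnglishStemmer;

    public class SnowballUsage {
      public static void main(String[] args) {
        EnglishStemmer stemmer = new EnglishStemmer();
        stemmer.setCurrent("clustering");
        // stem() returns true when stemming succeeded; getCurrent() then
        // yields the stemmed form ("cluster" for the English stemmer).
        if (stemmer.stem()) {
          System.out.println(stemmer.getCurrent());
        }
      }
    }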

Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java?rev=1212486&r1=1212485&r2=1212486&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java Fri Dec  9 15:54:39 2011
@@ -40,117 +40,117 @@ import org.slf4j.Logger;
  * Lucene APIs need to change, the changes can be made in this class.
  */
 public class LuceneCarrot2TokenizerFactory implements ITokenizerFactory {
-	final static Logger logger = org.slf4j.LoggerFactory
-			.getLogger(LuceneCarrot2TokenizerFactory.class);
+  final static Logger logger = org.slf4j.LoggerFactory
+      .getLogger(LuceneCarrot2TokenizerFactory.class);
 
-	@Override
-	public ITokenizer getTokenizer(LanguageCode language) {
-		switch (language) {
-		case CHINESE_SIMPLIFIED:
-			return ChineseTokenizerFactory.createTokenizer();
-
-			/*
-			 * We use our own analyzer for Arabic. Lucene's version has special
-			 * support for Nonspacing-Mark characters (see
-			 * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
-			 * have them included as letters in the parser.
-			 */
-		case ARABIC:
-			// Intentional fall-through.
-
-		default:
-			return new ExtendedWhitespaceTokenizer();
-		}
-	}
-
-	/**
-	 * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
-	 * {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
-	 * factory will fall back to the default white space tokenizer.
-	 */
-	private static final class ChineseTokenizerFactory {
-		static {
-			try {
-				ReflectionUtils.classForName(
-						"org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
-				ReflectionUtils.classForName(
-						"org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
-			} catch (Throwable e) {
-				logger
-						.warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
-								+ "of Chinese content may be degraded. For best quality clusters, "
-								+ "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
-			}
-		}
-
-		static ITokenizer createTokenizer() {
-			try {
-				return new ChineseTokenizer();
-			} catch (Throwable e) {
-				return new ExtendedWhitespaceTokenizer();
-			}
-		}
-
-		private final static class ChineseTokenizer implements ITokenizer {
-			private final static Pattern numeric = Pattern
-					.compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
-
-			private Tokenizer sentenceTokenizer;
-			private TokenStream wordTokenFilter;
-			private CharTermAttribute term = null;
-
-			private final MutableCharArray tempCharSequence;
-			private final Class<?> tokenFilterClass;
-
-			private ChineseTokenizer() throws Exception {
-				this.tempCharSequence = new MutableCharArray(new char[0]);
-
-				// As Smart Chinese is not available during compile time,
-				// we need to resort to reflection.
-				final Class<?> tokenizerClass = ReflectionUtils.classForName(
-						"org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
-				this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
-						Reader.class).newInstance((Reader) null);
-				this.tokenFilterClass = ReflectionUtils.classForName(
-						"org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
-			}
-
-			public short nextToken() throws IOException {
-				final boolean hasNextToken = wordTokenFilter.incrementToken();
-				if (hasNextToken) {
-					short flags = 0;
-					final char[] image = term.buffer();
-					final int length = term.length();
-					tempCharSequence.reset(image, 0, length);
-					if (length == 1 && image[0] == ',') {
-						// ChineseTokenizer seems to convert all punctuation to ','
-						// characters
-						flags = ITokenizer.TT_PUNCTUATION;
-					} else if (numeric.matcher(tempCharSequence).matches()) {
-						flags = ITokenizer.TT_NUMERIC;
-					} else {
-						flags = ITokenizer.TT_TERM;
-					}
-					return flags;
-				}
-
-				return ITokenizer.TT_EOF;
-			}
-
-			public void setTermBuffer(MutableCharArray array) {
-				array.reset(term.buffer(), 0, term.length());
-			}
-
-			public void reset(Reader input) throws IOException {
-				try {
-					sentenceTokenizer.reset(input);
-					wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
-							TokenStream.class).newInstance(sentenceTokenizer);
+  @Override
+  public ITokenizer getTokenizer(LanguageCode language) {
+    switch (language) {
+    case CHINESE_SIMPLIFIED:
+      return ChineseTokenizerFactory.createTokenizer();
+
+      /*
+       * We use our own analyzer for Arabic. Lucene's version has special
+       * support for Nonspacing-Mark characters (see
+       * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
+       * have them included as letters in the parser.
+       */
+    case ARABIC:
+      // Intentional fall-through.
+
+    default:
+      return new ExtendedWhitespaceTokenizer();
+    }
+  }
+
+  /**
+   * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
+   * {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
+   * factory will fall back to the default white space tokenizer.
+   */
+  private static final class ChineseTokenizerFactory {
+    static {
+      try {
+        ReflectionUtils.classForName(
+            "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
+        ReflectionUtils.classForName(
+            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
+      } catch (Throwable e) {
+        logger
+            .warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
+                + "of Chinese content may be degraded. For best quality clusters, "
+                + "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
+      }
+    }
+
+    static ITokenizer createTokenizer() {
+      try {
+        return new ChineseTokenizer();
+      } catch (Throwable e) {
+        return new ExtendedWhitespaceTokenizer();
+      }
+    }
+
+    private final static class ChineseTokenizer implements ITokenizer {
+      private final static Pattern numeric = Pattern
+          .compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
+
+      private Tokenizer sentenceTokenizer;
+      private TokenStream wordTokenFilter;
+      private CharTermAttribute term = null;
+
+      private final MutableCharArray tempCharSequence;
+      private final Class<?> tokenFilterClass;
+
+      private ChineseTokenizer() throws Exception {
+        this.tempCharSequence = new MutableCharArray(new char[0]);
+
+        // As Smart Chinese is not available during compile time,
+        // we need to resort to reflection.
+        final Class<?> tokenizerClass = ReflectionUtils.classForName(
+            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
+        this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
+            Reader.class).newInstance((Reader) null);
+        this.tokenFilterClass = ReflectionUtils.classForName(
+            "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
+      }
+
+      public short nextToken() throws IOException {
+        final boolean hasNextToken = wordTokenFilter.incrementToken();
+        if (hasNextToken) {
+          short flags = 0;
+          final char[] image = term.buffer();
+          final int length = term.length();
+          tempCharSequence.reset(image, 0, length);
+          if (length == 1 && image[0] == ',') {
+            // ChineseTokenizer seems to convert all punctuation to ','
+            // characters
+            flags = ITokenizer.TT_PUNCTUATION;
+          } else if (numeric.matcher(tempCharSequence).matches()) {
+            flags = ITokenizer.TT_NUMERIC;
+          } else {
+            flags = ITokenizer.TT_TERM;
+          }
+          return flags;
+        }
+
+        return ITokenizer.TT_EOF;
+      }
+
+      public void setTermBuffer(MutableCharArray array) {
+        array.reset(term.buffer(), 0, term.length());
+      }
+
+      public void reset(Reader input) throws IOException {
+        try {
+          sentenceTokenizer.reset(input);
+          wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
+              TokenStream.class).newInstance(sentenceTokenizer);
           term = wordTokenFilter.addAttribute(CharTermAttribute.class);
-				} catch (Exception e) {
-					throw ExceptionUtils.wrapAsRuntimeException(e);
-				}
-			}
-		}
-	}
+        } catch (Exception e) {
+          throw ExceptionUtils.wrapAsRuntimeException(e);
+        }
+      }
+    }
+  }
 }
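
The ChineseTokenizerFactory above deliberately resolves the Smart Chinese classes via reflection so that the analyzer JAR stays an optional runtime dependency. Stripped of the Carrot2 specifics, the pattern looks like the sketch below; OptionalClassCheck is illustrative, with only the class name taken from the code above.

    public final class OptionalClassCheck {
      /** Returns true if the named class can be loaded without initializing it. */
      static boolean isPresent(String className) {
        try {
          Class.forName(className, false, OptionalClassCheck.class.getClassLoader());
          return true;
        } catch (ClassNotFoundException e) {
          return false;
        }
      }

      public static void main(String[] args) {
        // Callers can fall back to a default implementation when the optional
        // JAR is absent, instead of failing with NoClassDefFoundError.
        String clazz = "org.apache.lucene.analysis.cn.smart.SentenceTokenizer";
        System.out.println(clazz + (isPresent(clazz) ? ": present" : ": absent"));
      }
    }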

Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java?rev=1212486&r1=1212485&r2=1212486&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java Fri Dec  9 15:54:39 2011
@@ -53,89 +53,89 @@ import com.google.common.collect.Multima
  */
 @Bindable
 public class SolrStopwordsCarrot2LexicalDataFactory implements
-		ILexicalDataFactory {
-	final static Logger logger = org.slf4j.LoggerFactory
-			.getLogger(SolrStopwordsCarrot2LexicalDataFactory.class);
-
-	@Init
-	@Input
-	@Attribute(key = "solrIndexSchema")
-	private IndexSchema schema;
-
-	@Processing
-	@Input
-	@Attribute(key = "solrFieldNames")
-	private Set<String> fieldNames;
-
-	/**
-	 * A lazily-built cache of stop words per field.
-	 */
-	private Multimap<String, CharArraySet> solrStopWords = HashMultimap.create();
-
-	/**
-	 * Carrot2's default lexical resources to use in addition to Solr's stop
-	 * words.
-	 */
-	private DefaultLexicalDataFactory carrot2LexicalDataFactory = new DefaultLexicalDataFactory();
-
-	/**
-	 * Obtains stop words for a field from the associated
-	 * {@link StopFilterFactory}, if any.
-	 */
-	private Collection<CharArraySet> getSolrStopWordsForField(String fieldName) {
-		// No need to synchronize here, Carrot2 ensures that instances
-		// of this class are not used by multiple threads at a time.
-		if (!solrStopWords.containsKey(fieldName)) {
-			final Analyzer fieldAnalyzer = schema.getFieldType(fieldName)
-					.getAnalyzer();
-			if (fieldAnalyzer instanceof TokenizerChain) {
-				final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer)
-						.getTokenFilterFactories();
-				for (TokenFilterFactory factory : filterFactories) {
-					if (factory instanceof StopFilterFactory) {
-						// StopFilterFactory holds the stop words in a CharArraySet, but
-						// the getStopWords() method returns a Set<?>, so we need to cast.
-						solrStopWords.put(fieldName,
-								(CharArraySet) ((StopFilterFactory) factory).getStopWords());
-					}
-
-					if (factory instanceof CommonGramsFilterFactory) {
-						solrStopWords.put(fieldName,
-								(CharArraySet) ((CommonGramsFilterFactory) factory)
-										.getCommonWords());
-					}
-				}
-			}
-		}
-		return solrStopWords.get(fieldName);
-	}
-
-	@Override
-	public ILexicalData getLexicalData(LanguageCode languageCode) {
-		final ILexicalData carrot2LexicalData = carrot2LexicalDataFactory
-				.getLexicalData(languageCode);
-
-		return new ILexicalData() {
-			@Override
-			public boolean isStopLabel(CharSequence word) {
-				// Nothing in Solr maps to the concept of a stop label,
-				// so return Carrot2's default here.
-				return carrot2LexicalData.isStopLabel(word);
-			}
-
-			@Override
-			public boolean isCommonWord(MutableCharArray word) {
-				// Loop over the fields involved in clustering first
-				for (String fieldName : fieldNames) {
-					for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
-						if (stopWords.contains(word)) {
-							return true;
-						}
-					}
-				}
-				// Check default Carrot2 stop words too
-				return carrot2LexicalData.isCommonWord(word);
-			}
-		};
-	}
+    ILexicalDataFactory {
+  final static Logger logger = org.slf4j.LoggerFactory
+      .getLogger(SolrStopwordsCarrot2LexicalDataFactory.class);
+
+  @Init
+  @Input
+  @Attribute(key = "solrIndexSchema")
+  private IndexSchema schema;
+
+  @Processing
+  @Input
+  @Attribute(key = "solrFieldNames")
+  private Set<String> fieldNames;
+
+  /**
+   * A lazily-built cache of stop words per field.
+   */
+  private Multimap<String, CharArraySet> solrStopWords = HashMultimap.create();
+
+  /**
+   * Carrot2's default lexical resources to use in addition to Solr's stop
+   * words.
+   */
+  private DefaultLexicalDataFactory carrot2LexicalDataFactory = new DefaultLexicalDataFactory();
+
+  /**
+   * Obtains stop words for a field from the associated
+   * {@link StopFilterFactory}, if any.
+   */
+  private Collection<CharArraySet> getSolrStopWordsForField(String fieldName) {
+    // No need to synchronize here, Carrot2 ensures that instances
+    // of this class are not used by multiple threads at a time.
+    if (!solrStopWords.containsKey(fieldName)) {
+      final Analyzer fieldAnalyzer = schema.getFieldType(fieldName)
+          .getAnalyzer();
+      if (fieldAnalyzer instanceof TokenizerChain) {
+        final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer)
+            .getTokenFilterFactories();
+        for (TokenFilterFactory factory : filterFactories) {
+          if (factory instanceof StopFilterFactory) {
+            // StopFilterFactory holds the stop words in a CharArraySet, but
+            // the getStopWords() method returns a Set<?>, so we need to cast.
+            solrStopWords.put(fieldName,
+                (CharArraySet) ((StopFilterFactory) factory).getStopWords());
+          }
+
+          if (factory instanceof CommonGramsFilterFactory) {
+            solrStopWords.put(fieldName,
+                (CharArraySet) ((CommonGramsFilterFactory) factory)
+                    .getCommonWords());
+          }
+        }
+      }
+    }
+    return solrStopWords.get(fieldName);
+  }
+
+  @Override
+  public ILexicalData getLexicalData(LanguageCode languageCode) {
+    final ILexicalData carrot2LexicalData = carrot2LexicalDataFactory
+        .getLexicalData(languageCode);
+
+    return new ILexicalData() {
+      @Override
+      public boolean isStopLabel(CharSequence word) {
+        // Nothing in Solr maps to the concept of a stop label,
+        // so return Carrot2's default here.
+        return carrot2LexicalData.isStopLabel(word);
+      }
+
+      @Override
+      public boolean isCommonWord(MutableCharArray word) {
+        // Loop over the fields involved in clustering first
+        for (String fieldName : fieldNames) {
+          for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
+            if (stopWords.contains(word)) {
+              return true;
+            }
+          }
+        }
+        // Check default Carrot2 stop words too
+        return carrot2LexicalData.isCommonWord(word);
+      }
+    };
+  }
 }
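
The anonymous ILexicalData returned by getLexicalData() above is a small decorator: Solr's per-field stop words are consulted first and Carrot2's defaults remain the fallback. The same shape in dependency-free Java is sketched below; StopWordSource and CombinedStopWordSource are illustrative stand-ins, not the actual Carrot2 interfaces.

    // Illustrative only; StopWordSource stands in for Carrot2's ILexicalData.
    interface StopWordSource {
      boolean isStopWord(String word);
    }

    final class CombinedStopWordSource implements StopWordSource {
      private final StopWordSource solrStopWords;    // e.g. from a StopFilterFactory
      private final StopWordSource carrot2Defaults;  // Carrot2's built-in lists

      CombinedStopWordSource(StopWordSource solr, StopWordSource defaults) {
        this.solrStopWords = solr;
        this.carrot2Defaults = defaults;
      }

      @Override
      public boolean isStopWord(String word) {
        // Solr stop words extend, rather than replace, the Carrot2 defaults.
        return solrStopWords.isStopWord(word) || carrot2Defaults.isStopWord(word);
      }
    }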

Modified: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=1212486&r1=1212485&r2=1212486&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Fri Dec  9 15:54:39 2011
@@ -50,10 +50,10 @@ import com.google.common.collect.Immutab
 public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
   @Test
   public void testCarrotLingo() throws Exception {
-  	// Note: the expected number of clusters may change after upgrading Carrot2
-  	// due to e.g. internal improvements or tuning of Carrot2 clustering.
+    // Note: the expected number of clusters may change after upgrading Carrot2
+    // due to e.g. internal improvements or tuning of Carrot2 clustering.
     final int expectedNumClusters = 10;
-		checkEngine(getClusteringEngine("default"), expectedNumClusters);
+    checkEngine(getClusteringEngine("default"), expectedNumClusters);
   }
 
   @Test
@@ -169,66 +169,66 @@ public class CarrotClusteringEngineTest 
             params), 1, 3, 0);
   }
 
-	@Test
-	public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
-		checkLexicalResourcesFromSolrConfig("lexical-resource-check",
-				"online,customsolrstopword,customsolrstoplabel");
-	}
-
-	@Test
-	public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
-		checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
-				"online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
-	}
-
-	private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
-			throws IOException {
-		ModifiableSolrParams params = new ModifiableSolrParams();
-		params.set("merge-resources", false);
-		params.set(AttributeUtils.getKey(
-				LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
-				wordsToCheck);
-
-		// "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
-		// stoplabels.mt, so we're expecting only one cluster with label "online".
-		final List<NamedList<Object>> clusters = checkEngine(
-				getClusteringEngine(engineName), 1, params);
-		assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
-	}
-
-	@Test
-	public void solrStopWordsUsedInCarrot2Clustering() throws Exception {
-		ModifiableSolrParams params = new ModifiableSolrParams();
-		params.set("merge-resources", false);
-		params.set(AttributeUtils.getKey(
-				LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
-		"online,solrownstopword");
-
-		// "solrownstopword" is in stopwords.txt, so we're expecting
-		// only one cluster with label "online".
-		final List<NamedList<Object>> clusters = checkEngine(
-				getClusteringEngine("lexical-resource-check"), 1, params);
-		assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
-	}
-
-	@Test
-	public void solrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
-		ModifiableSolrParams params = new ModifiableSolrParams();
-		// Force string fields to be used for clustering. Does not make sense
-		// in a real word, but does the job in the test.
-		params.set(CarrotParams.TITLE_FIELD_NAME, "url");
-		params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
-		params.set("merge-resources", false);
-		params.set(AttributeUtils.getKey(
-				LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
-		"online,solrownstopword");
-
-		final List<NamedList<Object>> clusters = checkEngine(
-				getClusteringEngine("lexical-resource-check"), 2, params);
-		assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
-		assertEquals(ImmutableList.of("solrownstopword"),
-				getLabels(clusters.get(1)));
-	}
+  @Test
+  public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
+    checkLexicalResourcesFromSolrConfig("lexical-resource-check",
+        "online,customsolrstopword,customsolrstoplabel");
+  }
+
+  @Test
+  public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
+    checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
+        "online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
+  }
+
+  private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
+      throws IOException {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set("merge-resources", false);
+    params.set(AttributeUtils.getKey(
+        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+        wordsToCheck);
+
+    // "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
+    // stoplabels.mt, so we're expecting only one cluster with label "online".
+    final List<NamedList<Object>> clusters = checkEngine(
+        getClusteringEngine(engineName), 1, params);
+    assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+  }
+
+  @Test
+  public void solrStopWordsUsedInCarrot2Clustering() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set("merge-resources", false);
+    params.set(AttributeUtils.getKey(
+        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+    "online,solrownstopword");
+
+    // "solrownstopword" is in stopwords.txt, so we're expecting
+    // only one cluster with label "online".
+    final List<NamedList<Object>> clusters = checkEngine(
+        getClusteringEngine("lexical-resource-check"), 1, params);
+    assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+  }
+
+  @Test
+  public void solrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    // Force string fields to be used for clustering. Does not make sense
+    // in a real word, but does the job in the test.
+    params.set(CarrotParams.TITLE_FIELD_NAME, "url");
+    params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
+    params.set("merge-resources", false);
+    params.set(AttributeUtils.getKey(
+        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+    "online,solrownstopword");
+
+    final List<NamedList<Object>> clusters = checkEngine(
+        getClusteringEngine("lexical-resource-check"), 2, params);
+    assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
+    assertEquals(ImmutableList.of("solrownstopword"),
+        getLabels(clusters.get(1)));
+  }
 
   private CarrotClusteringEngine getClusteringEngine(String engineName) {
     ClusteringComponent comp = (ClusteringComponent) h.getCore()
@@ -273,7 +273,7 @@ public class CarrotClusteringEngineTest 
       SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, engine.getFieldsToLoad(req), docIds );
 
       @SuppressWarnings("unchecked")
-			List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
+      List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
       req.close();
       assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
       checkClusters(results, false);
@@ -331,26 +331,26 @@ public class CarrotClusteringEngineTest 
     }
   }
 
-	@SuppressWarnings("unchecked")
-	private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
-		return (List<NamedList<Object>>) cluster.get("clusters");
-	}
-
-	@SuppressWarnings("unchecked")
-	private List<String> getLabels(NamedList<Object> cluster) {
-		return (List<String>) cluster.get("labels");
-	}
-
-	private Double getScore(NamedList<Object> cluster) {
-	  return (Double) cluster.get("score");
-	}
-
-	private Boolean isOtherTopics(NamedList<Object> cluster) {
-	  return (Boolean)cluster.get("other-topics");
-	}
-
-	@SuppressWarnings("unchecked")
-	private List<Object> getDocs(NamedList<Object> cluster) {
-		return (List<Object>) cluster.get("docs");
-	}
+  @SuppressWarnings("unchecked")
+  private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
+    return (List<NamedList<Object>>) cluster.get("clusters");
+  }
+
+  @SuppressWarnings("unchecked")
+  private List<String> getLabels(NamedList<Object> cluster) {
+    return (List<String>) cluster.get("labels");
+  }
+
+  private Double getScore(NamedList<Object> cluster) {
+    return (Double) cluster.get("score");
+  }
+
+  private Boolean isOtherTopics(NamedList<Object> cluster) {
+    return (Boolean)cluster.get("other-topics");
+  }
+
+  @SuppressWarnings("unchecked")
+  private List<Object> getDocs(NamedList<Object> cluster) {
+    return (List<Object>) cluster.get("docs");
+  }
 }

Modified: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java?rev=1212486&r1=1212485&r2=1212486&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java Fri Dec  9 15:54:39 2011
@@ -25,9 +25,7 @@ import org.carrot2.core.ProcessingCompon
 import org.carrot2.core.ProcessingException;
 import org.carrot2.core.attribute.AttributeNames;
 import org.carrot2.core.attribute.Processing;
-import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
 import org.carrot2.text.linguistic.ILexicalData;
-import org.carrot2.text.linguistic.ILexicalDataFactory;
 import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
 import org.carrot2.text.util.MutableCharArray;
 import org.carrot2.util.attribute.Attribute;
@@ -46,37 +44,37 @@ import com.google.common.collect.Lists;
  */
 @Bindable(prefix = "LexicalResourcesCheckClusteringAlgorithm")
 public class LexicalResourcesCheckClusteringAlgorithm extends
-		ProcessingComponentBase implements IClusteringAlgorithm {
+    ProcessingComponentBase implements IClusteringAlgorithm {
 
-	@Output
-	@Processing
-	@Attribute(key = AttributeNames.CLUSTERS)
-	private List<Cluster> clusters;
-
-	@Input
-	@Processing
-	@Attribute
-	private String wordsToCheck;
-
-	private BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();
-
-	@Override
-	public void process() throws ProcessingException {
-		clusters = Lists.newArrayList();
-		if (wordsToCheck == null) {
-			return;
-		}
-
-		// Test with Maltese so that the English clustering performed in other tests
-		// is not affected by the test stopwords and stoplabels.
-		ILexicalData lexicalData = preprocessing.lexicalDataFactory
-				.getLexicalData(LanguageCode.MALTESE);
-
-		for (String word : wordsToCheck.split(",")) {
-			if (!lexicalData.isCommonWord(new MutableCharArray(word))
-					&& !lexicalData.isStopLabel(word)) {
-				clusters.add(new Cluster(word));
-			}
-		}
-	}
+  @Output
+  @Processing
+  @Attribute(key = AttributeNames.CLUSTERS)
+  private List<Cluster> clusters;
+
+  @Input
+  @Processing
+  @Attribute
+  private String wordsToCheck;
+
+  private BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();
+
+  @Override
+  public void process() throws ProcessingException {
+    clusters = Lists.newArrayList();
+    if (wordsToCheck == null) {
+      return;
+    }
+
+    // Test with Maltese so that the English clustering performed in other tests
+    // is not affected by the test stopwords and stoplabels.
+    ILexicalData lexicalData = preprocessing.lexicalDataFactory
+        .getLexicalData(LanguageCode.MALTESE);
+
+    for (String word : wordsToCheck.split(",")) {
+      if (!lexicalData.isCommonWord(new MutableCharArray(word))
+          && !lexicalData.isStopLabel(word)) {
+        clusters.add(new Cluster(word));
+      }
+    }
+  }
 }