You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by st...@apache.org on 2011/12/09 16:54:40 UTC
svn commit: r1212486 - in /lucene/dev/trunk/solr/contrib/clustering/src:
java/org/apache/solr/handler/clustering/carrot2/
test/org/apache/solr/handler/clustering/carrot2/
Author: stanislaw
Date: Fri Dec 9 15:54:39 2011
New Revision: 1212486
URL: http://svn.apache.org/viewvc?rev=1212486&view=rev
Log:
Tabs to spaces.
Modified:
lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java
lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java
lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java
lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java
Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=1212486&r1=1212485&r2=1212486&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Fri Dec 9 15:54:39 2011
@@ -77,13 +77,13 @@ import com.google.common.io.Closeables;
* @see "http://project.carrot2.org"
*/
public class CarrotClusteringEngine extends SearchClusteringEngine {
- private transient static Logger log = LoggerFactory
+ private transient static Logger log = LoggerFactory
.getLogger(CarrotClusteringEngine.class);
- /**
- * The subdirectory in Solr config dir to read customized Carrot2 resources from.
- */
- private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
+ /**
+ * The subdirectory in Solr config dir to read customized Carrot2 resources from.
+ */
+ private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
/**
* Name of Carrot2 document's field containing Solr document's identifier.
@@ -227,8 +227,8 @@ public class CarrotClusteringEngine exte
}
}
- @Override
- @SuppressWarnings({ "unchecked", "rawtypes" })
+ @Override
+ @SuppressWarnings({ "unchecked", "rawtypes" })
public String init(NamedList config, final SolrCore core) {
String result = super.init(config, core);
final SolrParams initParams = SolrParams.toSolrParams(config);
@@ -243,13 +243,13 @@ public class CarrotClusteringEngine exte
// Additionally, we set a custom lexical resource factory for Carrot2 that
// will use both Carrot2 default stop words as well as stop words from
// the StopFilter defined on the field.
- BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
- .stemmerFactory(LuceneCarrot2StemmerFactory.class)
- .tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
- .lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
+ BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
+ .stemmerFactory(LuceneCarrot2StemmerFactory.class)
+ .tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
+ .lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
- // Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
- initAttributes.put("solrIndexSchema", core.getSchema());
+ // Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
+ initAttributes.put("solrIndexSchema", core.getSchema());
// Customize Carrot2's resource lookup to first look for resources
// using Solr's resource loader. If that fails, try loading from the classpath.
@@ -283,15 +283,15 @@ public class CarrotClusteringEngine exte
HashSet<String> fields = Sets.newHashSet(getFieldsForClustering(sreq));
fields.add(idFieldName);
fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
- return fields;
+ return fields;
}
- /**
- * Returns the names of fields that will be delivering the actual
- * content for clustering. Currently, there are two such fields: document
- * title and document content.
- */
- private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
+ /**
+ * Returns the names of fields that will be delivering the actual
+ * content for clustering. Currently, there are two such fields: document
+ * title and document content.
+ */
+ private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
SolrParams solrParams = sreq.getParams();
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
@@ -301,7 +301,7 @@ public class CarrotClusteringEngine exte
+ " must not be blank.");
}
return Sets.newHashSet(titleField, snippetField);
- }
+ }
/**
* Prepares Carrot2 documents for clustering.
@@ -362,7 +362,7 @@ public class CarrotClusteringEngine exte
if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
//should only be one document with one field
@SuppressWarnings("unchecked")
- NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
+ NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
String [] highlt = tmp.get(snippetField);
if (highlt != null && highlt.length == 1) {
snippet = highlt[0];
Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java?rev=1212486&r1=1212485&r2=1212486&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java Fri Dec 9 15:54:39 2011
@@ -50,192 +50,192 @@ import org.tartarus.snowball.ext.Turkish
* in this class.
*/
public class LuceneCarrot2StemmerFactory implements IStemmerFactory {
- final static Logger logger = org.slf4j.LoggerFactory
- .getLogger(LuceneCarrot2StemmerFactory.class);
+ final static Logger logger = org.slf4j.LoggerFactory
+ .getLogger(LuceneCarrot2StemmerFactory.class);
- @Override
- public IStemmer getStemmer(LanguageCode language) {
- switch (language) {
- case ARABIC:
- return ArabicStemmerFactory.createStemmer();
-
- case CHINESE_SIMPLIFIED:
- return IdentityStemmer.INSTANCE;
-
- default:
- /*
- * For other languages, try to use snowball's stemming.
- */
- return SnowballStemmerFactory.createStemmer(language);
- }
- }
-
- /**
- * Factory of {@link IStemmer} implementations from the <code>snowball</code>
- * project.
- */
- private final static class SnowballStemmerFactory {
- /**
- * Static hard mapping from language codes to stemmer classes in Snowball.
- * This mapping is not dynamic because we want to keep the possibility to
- * obfuscate these classes.
- */
- private static HashMap<LanguageCode, Class<? extends SnowballProgram>> snowballStemmerClasses;
- static {
- snowballStemmerClasses = new HashMap<LanguageCode, Class<? extends SnowballProgram>>();
- snowballStemmerClasses.put(LanguageCode.DANISH, DanishStemmer.class);
- snowballStemmerClasses.put(LanguageCode.DUTCH, DutchStemmer.class);
- snowballStemmerClasses.put(LanguageCode.ENGLISH, EnglishStemmer.class);
- snowballStemmerClasses.put(LanguageCode.FINNISH, FinnishStemmer.class);
- snowballStemmerClasses.put(LanguageCode.FRENCH, FrenchStemmer.class);
- snowballStemmerClasses.put(LanguageCode.GERMAN, GermanStemmer.class);
- snowballStemmerClasses
- .put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
- snowballStemmerClasses.put(LanguageCode.ITALIAN, ItalianStemmer.class);
- snowballStemmerClasses
- .put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
- snowballStemmerClasses.put(LanguageCode.PORTUGUESE,
- PortugueseStemmer.class);
- snowballStemmerClasses.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
- snowballStemmerClasses.put(LanguageCode.RUSSIAN, RussianStemmer.class);
- snowballStemmerClasses.put(LanguageCode.SPANISH, SpanishStemmer.class);
- snowballStemmerClasses.put(LanguageCode.SWEDISH, SwedishStemmer.class);
- snowballStemmerClasses.put(LanguageCode.TURKISH, TurkishStemmer.class);
- }
-
- /**
- * An adapter converting Snowball programs into {@link IStemmer} interface.
- */
- private static class SnowballStemmerAdapter implements IStemmer {
- private final SnowballProgram snowballStemmer;
-
- public SnowballStemmerAdapter(SnowballProgram snowballStemmer) {
- this.snowballStemmer = snowballStemmer;
- }
-
- public CharSequence stem(CharSequence word) {
- snowballStemmer.setCurrent(word.toString());
- if (snowballStemmer.stem()) {
- return snowballStemmer.getCurrent();
- } else {
- return null;
- }
- }
- }
-
- /**
- * Create and return an {@link IStemmer} adapter for a
- * {@link SnowballProgram} for a given language code. An identity stemmer is
- * returned for unknown languages.
- */
- public static IStemmer createStemmer(LanguageCode language) {
- final Class<? extends SnowballProgram> stemmerClazz = snowballStemmerClasses
- .get(language);
-
- if (stemmerClazz == null) {
- logger.warn("No Snowball stemmer class for: " + language.name()
- + ". Quality of clustering may be degraded.");
- return IdentityStemmer.INSTANCE;
- }
-
- try {
- return new SnowballStemmerAdapter(stemmerClazz.newInstance());
- } catch (Exception e) {
- logger.warn("Could not instantiate snowball stemmer"
- + " for language: " + language.name()
- + ". Quality of clustering may be degraded.", e);
-
- return IdentityStemmer.INSTANCE;
- }
- }
- }
-
- /**
- * Factory of {@link IStemmer} implementations for the
- * {@link LanguageCode#ARABIC} language. Requires <code>lucene-contrib</code>
- * to be present in classpath, otherwise an empty (identity) stemmer is
- * returned.
- */
- private static class ArabicStemmerFactory {
- static {
- try {
- ReflectionUtils.classForName(ArabicStemmer.class.getName(), false);
- ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false);
- } catch (ClassNotFoundException e) {
- logger
- .warn(
- "Could not instantiate Lucene stemmer for Arabic, clustering quality "
- + "of Arabic content may be degraded. For best quality clusters, "
- + "make sure Lucene's Arabic analyzer JAR is in the classpath",
- e);
- }
- }
-
- /**
- * Adapter to lucene-contrib Arabic analyzers.
- */
- private static class LuceneStemmerAdapter implements IStemmer {
- private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
- private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;
-
- private char[] buffer = new char[0];
-
- private LuceneStemmerAdapter() throws Exception {
- delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
- normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
- }
-
- public CharSequence stem(CharSequence word) {
- if (word.length() > buffer.length) {
- buffer = new char[word.length()];
- }
-
- for (int i = 0; i < word.length(); i++) {
- buffer[i] = word.charAt(i);
- }
-
- int newLen = normalizer.normalize(buffer, word.length());
- newLen = delegate.stem(buffer, newLen);
-
- if (newLen != word.length() || !equals(buffer, newLen, word)) {
- return CharBuffer.wrap(buffer, 0, newLen);
- }
-
- // Same-same.
- return null;
- }
-
- private boolean equals(char[] buffer, int len, CharSequence word) {
- assert len == word.length();
-
- for (int i = 0; i < len; i++) {
- if (buffer[i] != word.charAt(i))
- return false;
- }
-
- return true;
- }
- }
-
- public static IStemmer createStemmer() {
- try {
- return new LuceneStemmerAdapter();
- } catch (Throwable e) {
- return IdentityStemmer.INSTANCE;
- }
- }
- }
-
- /**
- * An implementation of {@link IStemmer} that always returns <code>null</code>
- * which means no stemming.
- */
- private static class IdentityStemmer implements IStemmer {
- private final static IdentityStemmer INSTANCE = new IdentityStemmer();
-
- @Override
- public CharSequence stem(CharSequence word) {
- return null;
- }
- }
+ @Override
+ public IStemmer getStemmer(LanguageCode language) {
+ switch (language) {
+ case ARABIC:
+ return ArabicStemmerFactory.createStemmer();
+
+ case CHINESE_SIMPLIFIED:
+ return IdentityStemmer.INSTANCE;
+
+ default:
+ /*
+ * For other languages, try to use snowball's stemming.
+ */
+ return SnowballStemmerFactory.createStemmer(language);
+ }
+ }
+
+ /**
+ * Factory of {@link IStemmer} implementations from the <code>snowball</code>
+ * project.
+ */
+ private final static class SnowballStemmerFactory {
+ /**
+ * Static hard mapping from language codes to stemmer classes in Snowball.
+ * This mapping is not dynamic because we want to keep the possibility to
+ * obfuscate these classes.
+ */
+ private static HashMap<LanguageCode, Class<? extends SnowballProgram>> snowballStemmerClasses;
+ static {
+ snowballStemmerClasses = new HashMap<LanguageCode, Class<? extends SnowballProgram>>();
+ snowballStemmerClasses.put(LanguageCode.DANISH, DanishStemmer.class);
+ snowballStemmerClasses.put(LanguageCode.DUTCH, DutchStemmer.class);
+ snowballStemmerClasses.put(LanguageCode.ENGLISH, EnglishStemmer.class);
+ snowballStemmerClasses.put(LanguageCode.FINNISH, FinnishStemmer.class);
+ snowballStemmerClasses.put(LanguageCode.FRENCH, FrenchStemmer.class);
+ snowballStemmerClasses.put(LanguageCode.GERMAN, GermanStemmer.class);
+ snowballStemmerClasses
+ .put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
+ snowballStemmerClasses.put(LanguageCode.ITALIAN, ItalianStemmer.class);
+ snowballStemmerClasses
+ .put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
+ snowballStemmerClasses.put(LanguageCode.PORTUGUESE,
+ PortugueseStemmer.class);
+ snowballStemmerClasses.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
+ snowballStemmerClasses.put(LanguageCode.RUSSIAN, RussianStemmer.class);
+ snowballStemmerClasses.put(LanguageCode.SPANISH, SpanishStemmer.class);
+ snowballStemmerClasses.put(LanguageCode.SWEDISH, SwedishStemmer.class);
+ snowballStemmerClasses.put(LanguageCode.TURKISH, TurkishStemmer.class);
+ }
+
+ /**
+ * An adapter converting Snowball programs into {@link IStemmer} interface.
+ */
+ private static class SnowballStemmerAdapter implements IStemmer {
+ private final SnowballProgram snowballStemmer;
+
+ public SnowballStemmerAdapter(SnowballProgram snowballStemmer) {
+ this.snowballStemmer = snowballStemmer;
+ }
+
+ public CharSequence stem(CharSequence word) {
+ snowballStemmer.setCurrent(word.toString());
+ if (snowballStemmer.stem()) {
+ return snowballStemmer.getCurrent();
+ } else {
+ return null;
+ }
+ }
+ }
+
+ /**
+ * Create and return an {@link IStemmer} adapter for a
+ * {@link SnowballProgram} for a given language code. An identity stemmer is
+ * returned for unknown languages.
+ */
+ public static IStemmer createStemmer(LanguageCode language) {
+ final Class<? extends SnowballProgram> stemmerClazz = snowballStemmerClasses
+ .get(language);
+
+ if (stemmerClazz == null) {
+ logger.warn("No Snowball stemmer class for: " + language.name()
+ + ". Quality of clustering may be degraded.");
+ return IdentityStemmer.INSTANCE;
+ }
+
+ try {
+ return new SnowballStemmerAdapter(stemmerClazz.newInstance());
+ } catch (Exception e) {
+ logger.warn("Could not instantiate snowball stemmer"
+ + " for language: " + language.name()
+ + ". Quality of clustering may be degraded.", e);
+
+ return IdentityStemmer.INSTANCE;
+ }
+ }
+ }
+
+ /**
+ * Factory of {@link IStemmer} implementations for the
+ * {@link LanguageCode#ARABIC} language. Requires <code>lucene-contrib</code>
+ * to be present in classpath, otherwise an empty (identity) stemmer is
+ * returned.
+ */
+ private static class ArabicStemmerFactory {
+ static {
+ try {
+ ReflectionUtils.classForName(ArabicStemmer.class.getName(), false);
+ ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false);
+ } catch (ClassNotFoundException e) {
+ logger
+ .warn(
+ "Could not instantiate Lucene stemmer for Arabic, clustering quality "
+ + "of Arabic content may be degraded. For best quality clusters, "
+ + "make sure Lucene's Arabic analyzer JAR is in the classpath",
+ e);
+ }
+ }
+
+ /**
+ * Adapter to lucene-contrib Arabic analyzers.
+ */
+ private static class LuceneStemmerAdapter implements IStemmer {
+ private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
+ private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;
+
+ private char[] buffer = new char[0];
+
+ private LuceneStemmerAdapter() throws Exception {
+ delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
+ normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
+ }
+
+ public CharSequence stem(CharSequence word) {
+ if (word.length() > buffer.length) {
+ buffer = new char[word.length()];
+ }
+
+ for (int i = 0; i < word.length(); i++) {
+ buffer[i] = word.charAt(i);
+ }
+
+ int newLen = normalizer.normalize(buffer, word.length());
+ newLen = delegate.stem(buffer, newLen);
+
+ if (newLen != word.length() || !equals(buffer, newLen, word)) {
+ return CharBuffer.wrap(buffer, 0, newLen);
+ }
+
+ // Same-same.
+ return null;
+ }
+
+ private boolean equals(char[] buffer, int len, CharSequence word) {
+ assert len == word.length();
+
+ for (int i = 0; i < len; i++) {
+ if (buffer[i] != word.charAt(i))
+ return false;
+ }
+
+ return true;
+ }
+ }
+
+ public static IStemmer createStemmer() {
+ try {
+ return new LuceneStemmerAdapter();
+ } catch (Throwable e) {
+ return IdentityStemmer.INSTANCE;
+ }
+ }
+ }
+
+ /**
+ * An implementation of {@link IStemmer} that always returns <code>null</code>
+ * which means no stemming.
+ */
+ private static class IdentityStemmer implements IStemmer {
+ private final static IdentityStemmer INSTANCE = new IdentityStemmer();
+
+ @Override
+ public CharSequence stem(CharSequence word) {
+ return null;
+ }
+ }
}
Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java?rev=1212486&r1=1212485&r2=1212486&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java Fri Dec 9 15:54:39 2011
@@ -40,117 +40,117 @@ import org.slf4j.Logger;
* Lucene APIs need to change, the changes can be made in this class.
*/
public class LuceneCarrot2TokenizerFactory implements ITokenizerFactory {
- final static Logger logger = org.slf4j.LoggerFactory
- .getLogger(LuceneCarrot2TokenizerFactory.class);
+ final static Logger logger = org.slf4j.LoggerFactory
+ .getLogger(LuceneCarrot2TokenizerFactory.class);
- @Override
- public ITokenizer getTokenizer(LanguageCode language) {
- switch (language) {
- case CHINESE_SIMPLIFIED:
- return ChineseTokenizerFactory.createTokenizer();
-
- /*
- * We use our own analyzer for Arabic. Lucene's version has special
- * support for Nonspacing-Mark characters (see
- * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
- * have them included as letters in the parser.
- */
- case ARABIC:
- // Intentional fall-through.
-
- default:
- return new ExtendedWhitespaceTokenizer();
- }
- }
-
- /**
- * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
- * {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
- * factory will fall back to the default white space tokenizer.
- */
- private static final class ChineseTokenizerFactory {
- static {
- try {
- ReflectionUtils.classForName(
- "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
- ReflectionUtils.classForName(
- "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
- } catch (Throwable e) {
- logger
- .warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
- + "of Chinese content may be degraded. For best quality clusters, "
- + "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
- }
- }
-
- static ITokenizer createTokenizer() {
- try {
- return new ChineseTokenizer();
- } catch (Throwable e) {
- return new ExtendedWhitespaceTokenizer();
- }
- }
-
- private final static class ChineseTokenizer implements ITokenizer {
- private final static Pattern numeric = Pattern
- .compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
-
- private Tokenizer sentenceTokenizer;
- private TokenStream wordTokenFilter;
- private CharTermAttribute term = null;
-
- private final MutableCharArray tempCharSequence;
- private final Class<?> tokenFilterClass;
-
- private ChineseTokenizer() throws Exception {
- this.tempCharSequence = new MutableCharArray(new char[0]);
-
- // As Smart Chinese is not available during compile time,
- // we need to resort to reflection.
- final Class<?> tokenizerClass = ReflectionUtils.classForName(
- "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
- this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
- Reader.class).newInstance((Reader) null);
- this.tokenFilterClass = ReflectionUtils.classForName(
- "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
- }
-
- public short nextToken() throws IOException {
- final boolean hasNextToken = wordTokenFilter.incrementToken();
- if (hasNextToken) {
- short flags = 0;
- final char[] image = term.buffer();
- final int length = term.length();
- tempCharSequence.reset(image, 0, length);
- if (length == 1 && image[0] == ',') {
- // ChineseTokenizer seems to convert all punctuation to ','
- // characters
- flags = ITokenizer.TT_PUNCTUATION;
- } else if (numeric.matcher(tempCharSequence).matches()) {
- flags = ITokenizer.TT_NUMERIC;
- } else {
- flags = ITokenizer.TT_TERM;
- }
- return flags;
- }
-
- return ITokenizer.TT_EOF;
- }
-
- public void setTermBuffer(MutableCharArray array) {
- array.reset(term.buffer(), 0, term.length());
- }
-
- public void reset(Reader input) throws IOException {
- try {
- sentenceTokenizer.reset(input);
- wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
- TokenStream.class).newInstance(sentenceTokenizer);
+ @Override
+ public ITokenizer getTokenizer(LanguageCode language) {
+ switch (language) {
+ case CHINESE_SIMPLIFIED:
+ return ChineseTokenizerFactory.createTokenizer();
+
+ /*
+ * We use our own analyzer for Arabic. Lucene's version has special
+ * support for Nonspacing-Mark characters (see
+ * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
+ * have them included as letters in the parser.
+ */
+ case ARABIC:
+ // Intentional fall-through.
+
+ default:
+ return new ExtendedWhitespaceTokenizer();
+ }
+ }
+
+ /**
+ * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
+ * {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
+ * factory will fall back to the default white space tokenizer.
+ */
+ private static final class ChineseTokenizerFactory {
+ static {
+ try {
+ ReflectionUtils.classForName(
+ "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
+ ReflectionUtils.classForName(
+ "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
+ } catch (Throwable e) {
+ logger
+ .warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
+ + "of Chinese content may be degraded. For best quality clusters, "
+ + "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
+ }
+ }
+
+ static ITokenizer createTokenizer() {
+ try {
+ return new ChineseTokenizer();
+ } catch (Throwable e) {
+ return new ExtendedWhitespaceTokenizer();
+ }
+ }
+
+ private final static class ChineseTokenizer implements ITokenizer {
+ private final static Pattern numeric = Pattern
+ .compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
+
+ private Tokenizer sentenceTokenizer;
+ private TokenStream wordTokenFilter;
+ private CharTermAttribute term = null;
+
+ private final MutableCharArray tempCharSequence;
+ private final Class<?> tokenFilterClass;
+
+ private ChineseTokenizer() throws Exception {
+ this.tempCharSequence = new MutableCharArray(new char[0]);
+
+ // As Smart Chinese is not available during compile time,
+ // we need to resort to reflection.
+ final Class<?> tokenizerClass = ReflectionUtils.classForName(
+ "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
+ this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
+ Reader.class).newInstance((Reader) null);
+ this.tokenFilterClass = ReflectionUtils.classForName(
+ "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
+ }
+
+ public short nextToken() throws IOException {
+ final boolean hasNextToken = wordTokenFilter.incrementToken();
+ if (hasNextToken) {
+ short flags = 0;
+ final char[] image = term.buffer();
+ final int length = term.length();
+ tempCharSequence.reset(image, 0, length);
+ if (length == 1 && image[0] == ',') {
+ // ChineseTokenizer seems to convert all punctuation to ','
+ // characters
+ flags = ITokenizer.TT_PUNCTUATION;
+ } else if (numeric.matcher(tempCharSequence).matches()) {
+ flags = ITokenizer.TT_NUMERIC;
+ } else {
+ flags = ITokenizer.TT_TERM;
+ }
+ return flags;
+ }
+
+ return ITokenizer.TT_EOF;
+ }
+
+ public void setTermBuffer(MutableCharArray array) {
+ array.reset(term.buffer(), 0, term.length());
+ }
+
+ public void reset(Reader input) throws IOException {
+ try {
+ sentenceTokenizer.reset(input);
+ wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
+ TokenStream.class).newInstance(sentenceTokenizer);
term = wordTokenFilter.addAttribute(CharTermAttribute.class);
- } catch (Exception e) {
- throw ExceptionUtils.wrapAsRuntimeException(e);
- }
- }
- }
- }
+ } catch (Exception e) {
+ throw ExceptionUtils.wrapAsRuntimeException(e);
+ }
+ }
+ }
+ }
}
Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java?rev=1212486&r1=1212485&r2=1212486&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java Fri Dec 9 15:54:39 2011
@@ -53,89 +53,89 @@ import com.google.common.collect.Multima
*/
@Bindable
public class SolrStopwordsCarrot2LexicalDataFactory implements
- ILexicalDataFactory {
- final static Logger logger = org.slf4j.LoggerFactory
- .getLogger(SolrStopwordsCarrot2LexicalDataFactory.class);
-
- @Init
- @Input
- @Attribute(key = "solrIndexSchema")
- private IndexSchema schema;
-
- @Processing
- @Input
- @Attribute(key = "solrFieldNames")
- private Set<String> fieldNames;
-
- /**
- * A lazily-built cache of stop words per field.
- */
- private Multimap<String, CharArraySet> solrStopWords = HashMultimap.create();
-
- /**
- * Carrot2's default lexical resources to use in addition to Solr's stop
- * words.
- */
- private DefaultLexicalDataFactory carrot2LexicalDataFactory = new DefaultLexicalDataFactory();
-
- /**
- * Obtains stop words for a field from the associated
- * {@link StopFilterFactory}, if any.
- */
- private Collection<CharArraySet> getSolrStopWordsForField(String fieldName) {
- // No need to synchronize here, Carrot2 ensures that instances
- // of this class are not used by multiple threads at a time.
- if (!solrStopWords.containsKey(fieldName)) {
- final Analyzer fieldAnalyzer = schema.getFieldType(fieldName)
- .getAnalyzer();
- if (fieldAnalyzer instanceof TokenizerChain) {
- final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer)
- .getTokenFilterFactories();
- for (TokenFilterFactory factory : filterFactories) {
- if (factory instanceof StopFilterFactory) {
- // StopFilterFactory holds the stop words in a CharArraySet, but
- // the getStopWords() method returns a Set<?>, so we need to cast.
- solrStopWords.put(fieldName,
- (CharArraySet) ((StopFilterFactory) factory).getStopWords());
- }
-
- if (factory instanceof CommonGramsFilterFactory) {
- solrStopWords.put(fieldName,
- (CharArraySet) ((CommonGramsFilterFactory) factory)
- .getCommonWords());
- }
- }
- }
- }
- return solrStopWords.get(fieldName);
- }
-
- @Override
- public ILexicalData getLexicalData(LanguageCode languageCode) {
- final ILexicalData carrot2LexicalData = carrot2LexicalDataFactory
- .getLexicalData(languageCode);
-
- return new ILexicalData() {
- @Override
- public boolean isStopLabel(CharSequence word) {
- // Nothing in Solr maps to the concept of a stop label,
- // so return Carrot2's default here.
- return carrot2LexicalData.isStopLabel(word);
- }
-
- @Override
- public boolean isCommonWord(MutableCharArray word) {
- // Loop over the fields involved in clustering first
- for (String fieldName : fieldNames) {
- for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
- if (stopWords.contains(word)) {
- return true;
- }
- }
- }
- // Check default Carrot2 stop words too
- return carrot2LexicalData.isCommonWord(word);
- }
- };
- }
+ ILexicalDataFactory {
+ final static Logger logger = org.slf4j.LoggerFactory
+ .getLogger(SolrStopwordsCarrot2LexicalDataFactory.class);
+
+ @Init
+ @Input
+ @Attribute(key = "solrIndexSchema")
+ private IndexSchema schema;
+
+ @Processing
+ @Input
+ @Attribute(key = "solrFieldNames")
+ private Set<String> fieldNames;
+
+ /**
+ * A lazily-built cache of stop words per field.
+ */
+ private Multimap<String, CharArraySet> solrStopWords = HashMultimap.create();
+
+ /**
+ * Carrot2's default lexical resources to use in addition to Solr's stop
+ * words.
+ */
+ private DefaultLexicalDataFactory carrot2LexicalDataFactory = new DefaultLexicalDataFactory();
+
+ /**
+ * Obtains stop words for a field from the associated
+ * {@link StopFilterFactory}, if any.
+ */
+ private Collection<CharArraySet> getSolrStopWordsForField(String fieldName) {
+ // No need to synchronize here, Carrot2 ensures that instances
+ // of this class are not used by multiple threads at a time.
+ if (!solrStopWords.containsKey(fieldName)) {
+ final Analyzer fieldAnalyzer = schema.getFieldType(fieldName)
+ .getAnalyzer();
+ if (fieldAnalyzer instanceof TokenizerChain) {
+ final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer)
+ .getTokenFilterFactories();
+ for (TokenFilterFactory factory : filterFactories) {
+ if (factory instanceof StopFilterFactory) {
+ // StopFilterFactory holds the stop words in a CharArraySet, but
+ // the getStopWords() method returns a Set<?>, so we need to cast.
+ solrStopWords.put(fieldName,
+ (CharArraySet) ((StopFilterFactory) factory).getStopWords());
+ }
+
+ if (factory instanceof CommonGramsFilterFactory) {
+ solrStopWords.put(fieldName,
+ (CharArraySet) ((CommonGramsFilterFactory) factory)
+ .getCommonWords());
+ }
+ }
+ }
+ }
+ return solrStopWords.get(fieldName);
+ }
+
+ @Override
+ public ILexicalData getLexicalData(LanguageCode languageCode) {
+ final ILexicalData carrot2LexicalData = carrot2LexicalDataFactory
+ .getLexicalData(languageCode);
+
+ return new ILexicalData() {
+ @Override
+ public boolean isStopLabel(CharSequence word) {
+ // Nothing in Solr maps to the concept of a stop label,
+ // so return Carrot2's default here.
+ return carrot2LexicalData.isStopLabel(word);
+ }
+
+ @Override
+ public boolean isCommonWord(MutableCharArray word) {
+ // Loop over the fields involved in clustering first
+ for (String fieldName : fieldNames) {
+ for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
+ if (stopWords.contains(word)) {
+ return true;
+ }
+ }
+ }
+ // Check default Carrot2 stop words too
+ return carrot2LexicalData.isCommonWord(word);
+ }
+ };
+ }
}
Modified: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=1212486&r1=1212485&r2=1212486&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Fri Dec 9 15:54:39 2011
@@ -50,10 +50,10 @@ import com.google.common.collect.Immutab
public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
@Test
public void testCarrotLingo() throws Exception {
- // Note: the expected number of clusters may change after upgrading Carrot2
- // due to e.g. internal improvements or tuning of Carrot2 clustering.
+ // Note: the expected number of clusters may change after upgrading Carrot2
+ // due to e.g. internal improvements or tuning of Carrot2 clustering.
final int expectedNumClusters = 10;
- checkEngine(getClusteringEngine("default"), expectedNumClusters);
+ checkEngine(getClusteringEngine("default"), expectedNumClusters);
}
@Test
@@ -169,66 +169,66 @@ public class CarrotClusteringEngineTest
params), 1, 3, 0);
}
- @Test
- public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
- checkLexicalResourcesFromSolrConfig("lexical-resource-check",
- "online,customsolrstopword,customsolrstoplabel");
- }
-
- @Test
- public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
- checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
- "online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
- }
-
- private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
- throws IOException {
- ModifiableSolrParams params = new ModifiableSolrParams();
- params.set("merge-resources", false);
- params.set(AttributeUtils.getKey(
- LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
- wordsToCheck);
-
- // "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
- // stoplabels.mt, so we're expecting only one cluster with label "online".
- final List<NamedList<Object>> clusters = checkEngine(
- getClusteringEngine(engineName), 1, params);
- assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
- }
-
- @Test
- public void solrStopWordsUsedInCarrot2Clustering() throws Exception {
- ModifiableSolrParams params = new ModifiableSolrParams();
- params.set("merge-resources", false);
- params.set(AttributeUtils.getKey(
- LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
- "online,solrownstopword");
-
- // "solrownstopword" is in stopwords.txt, so we're expecting
- // only one cluster with label "online".
- final List<NamedList<Object>> clusters = checkEngine(
- getClusteringEngine("lexical-resource-check"), 1, params);
- assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
- }
-
- @Test
- public void solrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
- ModifiableSolrParams params = new ModifiableSolrParams();
- // Force string fields to be used for clustering. Does not make sense
- // in the real world, but does the job in the test.
- params.set(CarrotParams.TITLE_FIELD_NAME, "url");
- params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
- params.set("merge-resources", false);
- params.set(AttributeUtils.getKey(
- LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
- "online,solrownstopword");
-
- final List<NamedList<Object>> clusters = checkEngine(
- getClusteringEngine("lexical-resource-check"), 2, params);
- assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
- assertEquals(ImmutableList.of("solrownstopword"),
- getLabels(clusters.get(1)));
- }
+ @Test
+ public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
+ checkLexicalResourcesFromSolrConfig("lexical-resource-check",
+ "online,customsolrstopword,customsolrstoplabel");
+ }
+
+ @Test
+ public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
+ checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
+ "online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
+ }
+
+ private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
+ throws IOException {
+ ModifiableSolrParams params = new ModifiableSolrParams();
+ params.set("merge-resources", false);
+ params.set(AttributeUtils.getKey(
+ LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+ wordsToCheck);
+
+ // "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
+ // stoplabels.mt, so we're expecting only one cluster with label "online".
+ final List<NamedList<Object>> clusters = checkEngine(
+ getClusteringEngine(engineName), 1, params);
+ assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+ }
+
+ @Test
+ public void solrStopWordsUsedInCarrot2Clustering() throws Exception {
+ ModifiableSolrParams params = new ModifiableSolrParams();
+ params.set("merge-resources", false);
+ params.set(AttributeUtils.getKey(
+ LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+ "online,solrownstopword");
+
+ // "solrownstopword" is in stopwords.txt, so we're expecting
+ // only one cluster with label "online".
+ final List<NamedList<Object>> clusters = checkEngine(
+ getClusteringEngine("lexical-resource-check"), 1, params);
+ assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+ }
+
+ @Test
+ public void solrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
+ ModifiableSolrParams params = new ModifiableSolrParams();
+ // Force string fields to be used for clustering. Does not make sense
+ // in the real world, but does the job in the test.
+ params.set(CarrotParams.TITLE_FIELD_NAME, "url");
+ params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
+ params.set("merge-resources", false);
+ params.set(AttributeUtils.getKey(
+ LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+ "online,solrownstopword");
+
+ final List<NamedList<Object>> clusters = checkEngine(
+ getClusteringEngine("lexical-resource-check"), 2, params);
+ assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
+ assertEquals(ImmutableList.of("solrownstopword"),
+ getLabels(clusters.get(1)));
+ }
private CarrotClusteringEngine getClusteringEngine(String engineName) {
ClusteringComponent comp = (ClusteringComponent) h.getCore()
@@ -273,7 +273,7 @@ public class CarrotClusteringEngineTest
SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, engine.getFieldsToLoad(req), docIds );
@SuppressWarnings("unchecked")
- List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
+ List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
req.close();
assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
checkClusters(results, false);
@@ -331,26 +331,26 @@ public class CarrotClusteringEngineTest
}
}
- @SuppressWarnings("unchecked")
- private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
- return (List<NamedList<Object>>) cluster.get("clusters");
- }
-
- @SuppressWarnings("unchecked")
- private List<String> getLabels(NamedList<Object> cluster) {
- return (List<String>) cluster.get("labels");
- }
-
- private Double getScore(NamedList<Object> cluster) {
- return (Double) cluster.get("score");
- }
-
- private Boolean isOtherTopics(NamedList<Object> cluster) {
- return (Boolean)cluster.get("other-topics");
- }
-
- @SuppressWarnings("unchecked")
- private List<Object> getDocs(NamedList<Object> cluster) {
- return (List<Object>) cluster.get("docs");
- }
+ @SuppressWarnings("unchecked")
+ private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
+ return (List<NamedList<Object>>) cluster.get("clusters");
+ }
+
+ @SuppressWarnings("unchecked")
+ private List<String> getLabels(NamedList<Object> cluster) {
+ return (List<String>) cluster.get("labels");
+ }
+
+ private Double getScore(NamedList<Object> cluster) {
+ return (Double) cluster.get("score");
+ }
+
+ private Boolean isOtherTopics(NamedList<Object> cluster) {
+ return (Boolean)cluster.get("other-topics");
+ }
+
+ @SuppressWarnings("unchecked")
+ private List<Object> getDocs(NamedList<Object> cluster) {
+ return (List<Object>) cluster.get("docs");
+ }
}
Modified: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java?rev=1212486&r1=1212485&r2=1212486&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java Fri Dec 9 15:54:39 2011
@@ -25,9 +25,7 @@ import org.carrot2.core.ProcessingCompon
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
-import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.text.linguistic.ILexicalData;
-import org.carrot2.text.linguistic.ILexicalDataFactory;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.attribute.Attribute;
@@ -46,37 +44,37 @@ import com.google.common.collect.Lists;
*/
@Bindable(prefix = "LexicalResourcesCheckClusteringAlgorithm")
public class LexicalResourcesCheckClusteringAlgorithm extends
- ProcessingComponentBase implements IClusteringAlgorithm {
+ ProcessingComponentBase implements IClusteringAlgorithm {
- @Output
- @Processing
- @Attribute(key = AttributeNames.CLUSTERS)
- private List<Cluster> clusters;
-
- @Input
- @Processing
- @Attribute
- private String wordsToCheck;
-
- private BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();
-
- @Override
- public void process() throws ProcessingException {
- clusters = Lists.newArrayList();
- if (wordsToCheck == null) {
- return;
- }
-
- // Test with Maltese so that the English clustering performed in other tests
- // is not affected by the test stopwords and stoplabels.
- ILexicalData lexicalData = preprocessing.lexicalDataFactory
- .getLexicalData(LanguageCode.MALTESE);
-
- for (String word : wordsToCheck.split(",")) {
- if (!lexicalData.isCommonWord(new MutableCharArray(word))
- && !lexicalData.isStopLabel(word)) {
- clusters.add(new Cluster(word));
- }
- }
- }
+ @Output
+ @Processing
+ @Attribute(key = AttributeNames.CLUSTERS)
+ private List<Cluster> clusters;
+
+ @Input
+ @Processing
+ @Attribute
+ private String wordsToCheck;
+
+ private BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();
+
+ @Override
+ public void process() throws ProcessingException {
+ clusters = Lists.newArrayList();
+ if (wordsToCheck == null) {
+ return;
+ }
+
+ // Test with Maltese so that the English clustering performed in other tests
+ // is not affected by the test stopwords and stoplabels.
+ ILexicalData lexicalData = preprocessing.lexicalDataFactory
+ .getLexicalData(LanguageCode.MALTESE);
+
+ for (String word : wordsToCheck.split(",")) {
+ if (!lexicalData.isCommonWord(new MutableCharArray(word))
+ && !lexicalData.isStopLabel(word)) {
+ clusters.add(new Cluster(word));
+ }
+ }
+ }
}