You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2011/05/22 23:45:45 UTC

svn commit: r1126234 [23/28] - in /lucene/dev/branches/solr2452: ./ dev-tools/eclipse/ dev-tools/idea/ dev-tools/idea/.idea/ dev-tools/idea/lucene/ dev-tools/idea/lucene/contrib/ant/ dev-tools/idea/lucene/contrib/db/bdb-je/ dev-tools/idea/lucene/contri...

Modified: lucene/dev/branches/solr2452/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Sun May 22 21:45:19 2011
@@ -18,9 +18,11 @@ package org.apache.solr.handler.clusteri
  */
 
 import java.io.IOException;
+import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -37,6 +39,7 @@ import org.apache.solr.common.params.Sol
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.SimpleOrderedMap;
 import org.apache.solr.core.SolrCore;
+import org.apache.solr.core.SolrResourceLoader;
 import org.apache.solr.handler.clustering.SearchClusteringEngine;
 import org.apache.solr.handler.component.HighlightComponent;
 import org.apache.solr.highlight.SolrHighlighter;
@@ -52,9 +55,17 @@ import org.carrot2.core.ControllerFactor
 import org.carrot2.core.Document;
 import org.carrot2.core.IClusteringAlgorithm;
 import org.carrot2.core.attribute.AttributeNames;
+import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
+import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
+import org.carrot2.util.resource.ClassLoaderLocator;
+import org.carrot2.util.resource.IResource;
+import org.carrot2.util.resource.IResourceLocator;
+import org.carrot2.util.resource.ResourceLookup;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
 
 /**
@@ -64,19 +75,33 @@ import com.google.common.collect.Sets;
  *
  * @link http://project.carrot2.org
  */
-@SuppressWarnings("unchecked")
 public class CarrotClusteringEngine extends SearchClusteringEngine {
-  private transient static Logger log = LoggerFactory
+	private transient static Logger log = LoggerFactory
           .getLogger(CarrotClusteringEngine.class);
 
+	/**
+	 * The subdirectory in Solr config dir to read customized Carrot2 resources from.
+	 */
+	private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
+
+  /**
+   * Name of Carrot2 document's field containing Solr document's identifier.
+   */
+  private static final String SOLR_DOCUMENT_ID = "solrId";
+
+  /**
+   * Name of Solr document's field containing the document's identifier. To avoid
+   * repeating document content in the clustering output, each cluster lists only
+   * the identifiers of the documents that belong to it.
+   */
+  private String idFieldName;
+
   /**
    * Carrot2 controller that manages instances of clustering algorithms
    */
   private Controller controller = ControllerFactory.createPooling();
   private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
 
-  private String idFieldName;
-
   @Override
   @Deprecated
   public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) {
@@ -101,6 +126,10 @@ public class CarrotClusteringEngine exte
       attributes.put(AttributeNames.DOCUMENTS, documents);
       attributes.put(AttributeNames.QUERY, query.toString());
 
+      // Pass the fields on which clustering runs to the
+      // SolrStopwordsCarrot2LexicalDataFactory
+      attributes.put("solrFieldNames", getFieldsForClustering(sreq));
+
       // Pass extra overriding attributes from the request, if any
       extractCarrotAttributes(sreq.getParams(), attributes);
 
@@ -113,22 +142,68 @@ public class CarrotClusteringEngine exte
     }
   }
 
-  @Override
+	@Override
+	@SuppressWarnings({ "unchecked", "rawtypes" })
   public String init(NamedList config, final SolrCore core) {
     String result = super.init(config, core);
-    SolrParams initParams = SolrParams.toSolrParams(config);
+    final SolrParams initParams = SolrParams.toSolrParams(config);
 
     // Initialize Carrot2 controller. Pass initialization attributes, if any.
     HashMap<String, Object> initAttributes = new HashMap<String, Object>();
     extractCarrotAttributes(initParams, initAttributes);
-    
-    // Customize the language model factory. The implementation we provide here
-    // is included in the code base of Solr, so that it's possible to refactor
-    // the Lucene APIs the factory relies on if needed.
-    initAttributes.put("PreprocessingPipeline.languageModelFactory",
-      LuceneLanguageModelFactory.class);
-    this.controller.init(initAttributes);
 
+    // Customize the stemmer and tokenizer factories. The implementations we provide here
+    // are included in the code base of Solr, so that it's possible to refactor
+    // the Lucene APIs the factories rely on if needed.
+    // Additionally, we set a custom lexical resource factory for Carrot2 that
+    // will use both Carrot2 default stop words as well as stop words from
+    // the StopFilter defined on the field.
+		BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
+				.stemmerFactory(LuceneCarrot2StemmerFactory.class)
+				.tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
+				.lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
+
+		// Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
+		initAttributes.put("solrIndexSchema", core.getSchema());
+
+    // Customize Carrot2's resource lookup to first look for resources
+    // using Solr's resource loader. If that fails, try loading from the classpath.
+    DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes)
+        .resourceLookup(new ResourceLookup(new IResourceLocator() {
+          @Override
+          public IResource[] getAll(final String resource) {
+            final SolrResourceLoader resourceLoader = core.getResourceLoader();
+            final String carrot2ResourcesDir = resourceLoader.getConfigDir()
+                + initParams.get(CarrotParams.LEXICAL_RESOURCES_DIR, CARROT_RESOURCES_PREFIX);
+            try {
+              log.debug("Looking for " + resource + " in "
+                  + carrot2ResourcesDir);
+              final InputStream resourceStream = resourceLoader
+                  .openResource(carrot2ResourcesDir + "/" + resource);
+
+              log.info(resource + " loaded from " + carrot2ResourcesDir);
+              final IResource foundResource = new IResource() {
+                @Override
+                public InputStream open() throws IOException {
+                  return resourceStream;
+                }
+              };
+              return new IResource[] { foundResource };
+            } catch (RuntimeException e) {
+              // No way to distinguish if the resource was found but failed
+              // to load or wasn't found at all, so we simply fall back
+              // to Carrot2 defaults here by returning an empty locations array.
+              log.debug(resource + " not found in " + carrot2ResourcesDir
+                  + ". Using the default " + resource + " from Carrot JAR.");
+              return new IResource[] {};
+            }
+          }
+        },
+
+        // Using the class loader directly because this time we want to omit the prefix
+        new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));
+
+    this.controller.init(initAttributes);
     this.idFieldName = core.getSchema().getUniqueKeyField().getName();
 
     // Make sure the requested Carrot2 clustering algorithm class is available
@@ -148,17 +223,29 @@ public class CarrotClusteringEngine exte
   protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
     SolrParams solrParams = sreq.getParams();
 
-    // Names of fields to deliver content for clustering
-    String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
+    HashSet<String> fields = Sets.newHashSet(getFieldsForClustering(sreq));
+    fields.add(idFieldName);
+    fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
+		return fields;
+  }
+
+	/**
+	 * Returns the names of fields that will be delivering the actual
+	 * content for clustering. Currently, there are two such fields: document
+	 * title and document content.
+	 */
+	private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
+    SolrParams solrParams = sreq.getParams();
+
     String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
     String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
     if (StringUtils.isBlank(snippetField)) {
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
               + " must not be blank.");
     }
-    return Sets.newHashSet(urlField, titleField, snippetField, idFieldName);
-  }
-  
+    return Sets.newHashSet(titleField, snippetField);
+	}
+
   /**
    * Prepares Carrot2 documents for clustering.
    */
@@ -180,7 +267,7 @@ public class CarrotClusteringEngine exte
     if (produceSummary == true) {
       highlighter = HighlightComponent.getHighlighter(core);
       if (highlighter != null){
-        Map args = new HashMap();
+        Map<String, Object> args = Maps.newHashMap();
         snippetFieldAry = new String[]{snippetField};
         args.put(HighlightParams.FIELDS, snippetFieldAry);
         args.put(HighlightParams.HIGHLIGHT, "true");
@@ -214,11 +301,12 @@ public class CarrotClusteringEngine exte
       if (produceSummary && docIds != null) {
         docsHolder[0] = docIds.get(sdoc).intValue();
         DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
-        NamedList highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
+        NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
         if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
           //should only be one document with one field
-          NamedList tmp = (NamedList) highlights.getVal(0);
-          String [] highlt = (String[]) tmp.get(snippetField);
+          @SuppressWarnings("unchecked")
+					NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
+          String [] highlt = tmp.get(snippetField);
           if (highlt != null && highlt.length == 1) {
             snippet = highlt[0];
           }
@@ -226,27 +314,13 @@ public class CarrotClusteringEngine exte
       }
       Document carrotDocument = new Document(getValue(sdoc, titleField),
               snippet, (String)sdoc.getFieldValue(urlField));
-      carrotDocument.setField("solrId", sdoc.getFieldValue(idFieldName));
+      carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
       result.add(carrotDocument);
     }
 
     return result;
   }
 
-  @Deprecated
-  protected String getValue(org.apache.lucene.document.Document doc,
-                            String field) {
-    StringBuilder result = new StringBuilder();
-    String[] vals = doc.getValues(field);
-    for (int i = 0; i < vals.length; i++) {
-      // Join multiple values with a period so that Carrot2 does not pick up
-      // phrases that cross field value boundaries (in most cases it would
-      // create useless phrases).
-      result.append(vals[i]).append(" . ");
-    }
-    return result.toString().trim();
-  }
-
   protected String getValue(SolrDocument sdoc, String field) {
     StringBuilder result = new StringBuilder();
     Collection<Object> vals = sdoc.getFieldValues(field);
@@ -261,9 +335,9 @@ public class CarrotClusteringEngine exte
     return result.toString().trim();
   }
 
-  private List clustersToNamedList(List<Cluster> carrotClusters,
+  private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
                                    SolrParams solrParams) {
-    List result = new ArrayList();
+    List<NamedList<Object>> result = Lists.newArrayList();
     clustersToNamedList(carrotClusters, result, solrParams.getBool(
             CarrotParams.OUTPUT_SUB_CLUSTERS, true), solrParams.getInt(
             CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
@@ -271,25 +345,40 @@ public class CarrotClusteringEngine exte
   }
 
   private void clustersToNamedList(List<Cluster> outputClusters,
-                                   List parent, boolean outputSubClusters, int maxLabels) {
+                                   List<NamedList<Object>> parent, boolean outputSubClusters, int maxLabels) {
     for (Cluster outCluster : outputClusters) {
-      NamedList cluster = new SimpleOrderedMap();
+      NamedList<Object> cluster = new SimpleOrderedMap<Object>();
       parent.add(cluster);
 
+      // Add labels
       List<String> labels = outCluster.getPhrases();
-      if (labels.size() > maxLabels)
+      if (labels.size() > maxLabels) {
         labels = labels.subList(0, maxLabels);
+      }
       cluster.add("labels", labels);
 
+      // Add cluster score
+      final Double score = outCluster.getScore();
+      if (score != null) {
+        cluster.add("score", score);
+      }
+
+      // Add other topics marker
+      if (outCluster.isOtherTopics()) {
+        cluster.add("other-topics", outCluster.isOtherTopics());
+      }
+
+      // Add documents
       List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
-      List docList = new ArrayList();
+      List<Object> docList = Lists.newArrayList();
       cluster.add("docs", docList);
       for (Document doc : docs) {
-        docList.add(doc.getField("solrId"));
+        docList.add(doc.getField(SOLR_DOCUMENT_ID));
       }
 
-      if (outputSubClusters) {
-        List subclusters = new ArrayList();
+      // Add subclusters
+      if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) {
+        List<NamedList<Object>> subclusters = Lists.newArrayList();
         cluster.add("clusters", subclusters);
         clustersToNamedList(outCluster.getSubclusters(), subclusters,
                 outputSubClusters, maxLabels);

Modified: lucene/dev/branches/solr2452/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java Sun May 22 21:45:19 2011
@@ -35,6 +35,8 @@ public interface CarrotParams {
   String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
   String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragzise";
 
+  String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
+
   public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
           ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
           PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS, SUMMARY_FRAGSIZE);

Modified: lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Sun May 22 21:45:19 2011
@@ -17,6 +17,11 @@ package org.apache.solr.handler.clusteri
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
@@ -37,15 +42,11 @@ import org.apache.solr.util.SolrPluginUt
 import org.carrot2.util.attribute.AttributeUtils;
 import org.junit.Test;
 
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import com.google.common.collect.ImmutableList;
 
 /**
  *
  */
-@SuppressWarnings("unchecked")
 public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
   @Test
   public void testCarrotLingo() throws Exception {
@@ -74,7 +75,7 @@ public class CarrotClusteringEngineTest 
 
   @Test
   public void testWithoutSubclusters() throws Exception {
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs),
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs),
             1, 1, 0);
   }
 
@@ -82,7 +83,7 @@ public class CarrotClusteringEngineTest 
   public void testWithSubclusters() throws Exception {
     ModifiableSolrParams params = new ModifiableSolrParams();
     params.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs), 1, 1, 2);
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs), 1, 1, 2);
   }
 
   @Test
@@ -90,19 +91,107 @@ public class CarrotClusteringEngineTest 
     ModifiableSolrParams params = new ModifiableSolrParams();
     params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 5);
     params.set(CarrotParams.NUM_DESCRIPTIONS, 3);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
             params), 1, 3, 0);
   }
 
   @Test
+  public void testClusterScores() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+    List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+        AbstractClusteringTestCase.numberOfDocs, params);
+    int i = 1;
+    for (NamedList<Object> cluster : clusters) {
+      final Double score = getScore(cluster);
+      assertNotNull(score);
+      assertEquals(0.25 * i++, score, 0);
+    }
+  }
+
+  @Test
+  public void testOtherTopics() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "otherTopicsModulo"), 2);
+    List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+        AbstractClusteringTestCase.numberOfDocs, params);
+    int i = 1;
+    for (NamedList<Object> cluster : clusters) {
+      assertEquals(i++ % 2 == 0 ? true : null, isOtherTopics(cluster));
+    }
+  }
+
+  @Test
   public void testCarrotAttributePassing() throws Exception {
     ModifiableSolrParams params = new ModifiableSolrParams();
     params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
     params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 3);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
             params), 1, 3, 0);
   }
 
+	@Test
+	public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
+		checkLexicalResourcesFromSolrConfig("lexical-resource-check",
+				"online,customsolrstopword,customsolrstoplabel");
+	}
+
+	@Test
+	public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
+		checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
+				"online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
+	}
+
+	private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
+			throws IOException {
+		ModifiableSolrParams params = new ModifiableSolrParams();
+		params.set("merge-resources", false);
+		params.set(AttributeUtils.getKey(
+				LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+				wordsToCheck);
+
+		// "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
+		// stoplabels.en, so we're expecting only one cluster with label "online".
+		final List<NamedList<Object>> clusters = checkEngine(
+				getClusteringEngine(engineName), 1, params);
+		assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+	}
+
+	@Test
+	public void solrStopWordsUsedInCarrot2Clustering() throws Exception {
+		ModifiableSolrParams params = new ModifiableSolrParams();
+		params.set("merge-resources", false);
+		params.set(AttributeUtils.getKey(
+				LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+		"online,solrownstopword");
+
+		// "solrownstopword" is in stopwords.txt, so we're expecting
+		// only one cluster with label "online".
+		final List<NamedList<Object>> clusters = checkEngine(
+				getClusteringEngine("lexical-resource-check"), 1, params);
+		assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+	}
+
+	@Test
+	public void solrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
+		ModifiableSolrParams params = new ModifiableSolrParams();
+		// Force string fields to be used for clustering. Does not make sense
+		// in the real world, but does the job in the test.
+		params.set(CarrotParams.TITLE_FIELD_NAME, "url");
+		params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
+		params.set("merge-resources", false);
+		params.set(AttributeUtils.getKey(
+				LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+		"online,solrownstopword");
+
+		final List<NamedList<Object>> clusters = checkEngine(
+				getClusteringEngine("lexical-resource-check"), 2, params);
+		assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
+		assertEquals(ImmutableList.of("solrownstopword"),
+				getLabels(clusters.get(1)));
+	}
+
   private CarrotClusteringEngine getClusteringEngine(String engineName) {
     ClusteringComponent comp = (ClusteringComponent) h.getCore()
             .getSearchComponent("clustering");
@@ -114,18 +203,18 @@ public class CarrotClusteringEngineTest 
     return engine;
   }
 
-  private List checkEngine(CarrotClusteringEngine engine,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
                             int expectedNumClusters) throws IOException {
     return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), new ModifiableSolrParams());
   }
 
-  private List checkEngine(CarrotClusteringEngine engine,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
                             int expectedNumClusters, SolrParams clusteringParams) throws IOException {
     return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), clusteringParams);
   }
 
 
-  private List checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
                            int expectedNumClusters, Query query, SolrParams clusteringParams) throws IOException {
     // Get all documents to cluster
     RefCounted<SolrIndexSearcher> ref = h.getCore().getSearcher();
@@ -145,7 +234,9 @@ public class CarrotClusteringEngineTest 
       LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
       Map<SolrDocument,Integer> docIds = new HashMap<SolrDocument, Integer>(docList.size());
       SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, engine.getFieldsToLoad(req), docIds );
-      List results = (List)engine.cluster(query, solrDocList, docIds, req);
+
+      @SuppressWarnings("unchecked")
+			List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
       req.close();
       assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
       checkClusters(results, false);
@@ -155,51 +246,74 @@ public class CarrotClusteringEngineTest 
     }
   }
 
-  private void checkClusters(List results, int expectedDocCount,
+  private void checkClusters(List<NamedList<Object>> results, int expectedDocCount,
                              int expectedLabelCount, int expectedSubclusterCount) {
     for (int i = 0; i < results.size(); i++) {
-      NamedList cluster = (NamedList) results.get(i);
+      NamedList<Object> cluster = results.get(i);
       checkCluster(cluster, expectedDocCount, expectedLabelCount,
               expectedSubclusterCount);
     }
   }
 
-  private void checkClusters(List results, boolean hasSubclusters) {
+  private void checkClusters(List<NamedList<Object>> results, boolean hasSubclusters) {
     for (int i = 0; i < results.size(); i++) {
-      checkCluster((NamedList) results.get(i), hasSubclusters);
+      checkCluster(results.get(i), hasSubclusters);
     }
   }
 
-  private void checkCluster(NamedList cluster, boolean hasSubclusters) {
-    List docs = (List) cluster.get("docs");
+  private void checkCluster(NamedList<Object> cluster, boolean hasSubclusters) {
+    List<Object> docs = getDocs(cluster);
     assertNotNull("docs is null and it shouldn't be", docs);
     for (int j = 0; j < docs.size(); j++) {
       String id = (String) docs.get(j);
       assertNotNull("id is null and it shouldn't be", id);
     }
 
-    List labels = (List) cluster.get("labels");
+    List<String> labels = getLabels(cluster);
     assertNotNull("labels is null but it shouldn't be", labels);
 
     if (hasSubclusters) {
-      List subclusters = (List) cluster.get("clusters");
+      List<NamedList<Object>> subclusters = getSubclusters(cluster);
       assertNotNull("subclusters is null but it shouldn't be", subclusters);
     }
   }
 
-  private void checkCluster(NamedList cluster, int expectedDocCount,
+  private void checkCluster(NamedList<Object> cluster, int expectedDocCount,
                             int expectedLabelCount, int expectedSubclusterCount) {
     checkCluster(cluster, expectedSubclusterCount > 0);
     assertEquals("number of docs in cluster", expectedDocCount,
-            ((List) cluster.get("docs")).size());
+            getDocs(cluster).size());
     assertEquals("number of labels in cluster", expectedLabelCount,
-            ((List) cluster.get("labels")).size());
+            getLabels(cluster).size());
 
     if (expectedSubclusterCount > 0) {
-      List subclusters = (List) cluster.get("clusters");
+      List<NamedList<Object>> subclusters = getSubclusters(cluster);
       assertEquals("numClusters", expectedSubclusterCount, subclusters.size());
       assertEquals("number of subclusters in cluster",
               expectedSubclusterCount, subclusters.size());
     }
   }
+
+	@SuppressWarnings("unchecked")
+	private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
+		return (List<NamedList<Object>>) cluster.get("clusters");
+	}
+
+	@SuppressWarnings("unchecked")
+	private List<String> getLabels(NamedList<Object> cluster) {
+		return (List<String>) cluster.get("labels");
+	}
+
+	private Double getScore(NamedList<Object> cluster) {
+	  return (Double) cluster.get("score");
+	}
+
+	private Boolean isOtherTopics(NamedList<Object> cluster) {
+	  return (Boolean)cluster.get("other-topics");
+	}
+
+	@SuppressWarnings("unchecked")
+	private List<Object> getDocs(NamedList<Object> cluster) {
+		return (List<Object>) cluster.get("docs");
+	}
 }

Modified: lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java Sun May 22 21:45:19 2011
@@ -49,6 +49,11 @@ public class MockClusteringAlgorithm ext
   @IntRange(min = 1, max = 5)
   private int labels = 1;
 
+  @Input
+  @Processing
+  @Attribute
+  private int otherTopicsModulo = 0;
+
   @Override
   public void process() throws ProcessingException {
     clusters = Lists.newArrayList();
@@ -59,21 +64,26 @@ public class MockClusteringAlgorithm ext
     int documentIndex = 1;
     for (Document document : documents) {
       StringBuilder label = new StringBuilder("Cluster " + documentIndex);
-      Cluster cluster = createCluster(label.toString(), document);
+      Cluster cluster = createCluster(label.toString(), documentIndex, document);
       clusters.add(cluster);
       for (int i = 1; i <= depth; i++) {
         label.append(".");
         label.append(i);
-        Cluster newCluster = createCluster(label.toString(), document);
-        cluster.addSubclusters(createCluster(label.toString(), document), newCluster);
+        Cluster newCluster = createCluster(label.toString(), documentIndex, document);
+        cluster.addSubclusters(createCluster(label.toString(), documentIndex, document), newCluster);
         cluster = newCluster;
       }
       documentIndex++;
     }
   }
 
-  private Cluster createCluster(String labelBase, Document... documents) {
+  private Cluster createCluster(String labelBase, int documentIndex, Document... documents) {
     Cluster cluster = new Cluster();
+    cluster.setScore(documentIndex * 0.25);
+    if (otherTopicsModulo != 0 && documentIndex % otherTopicsModulo == 0)
+    {
+      cluster.setOtherTopics(true);
+    }
     for (int i = 0; i < labels; i++) {
       cluster.addPhrases(labelBase + "#" + (i + 1));
     }

Modified: lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml (original)
+++ lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml Sun May 22 21:45:19 2011
@@ -396,6 +396,15 @@
       <str name="name">mock</str>
       <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
     </lst>
+    <lst name="engine">
+      <str name="name">lexical-resource-check</str>
+      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+    </lst>
+    <lst name="engine">
+      <str name="name">lexical-resource-check-custom-resource-dir</str>
+      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+      <str name="carrot.lexicalResourcesDir">clustering/custom</str>
+    </lst>
   </searchComponent>
 
   <searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="doc-clustering">

Modified: lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt (original)
+++ lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt Sun May 22 21:45:19 2011
@@ -55,4 +55,5 @@ to
 was
 will
 with
+solrownstopword
 

Modified: lucene/dev/branches/solr2452/solr/contrib/contrib-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/contrib-build.xml?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/contrib-build.xml (original)
+++ lucene/dev/branches/solr2452/solr/contrib/contrib-build.xml Sun May 22 21:45:19 2011
@@ -20,7 +20,7 @@
 <project name="solr-contrib-build">
   <!-- TODO: adjust build.dir/dist.dir appropriately when a contrib project is run individually -->
   <property name="build.dir" location="../../build/contrib/${ant.project.name}"/>
-  <property name="dist.dir" location="../../dist/"/>
+  <property name="dist.dir" location="../../dist"/>
   	
   <import file="../common-build.xml"/>
 
@@ -39,7 +39,7 @@
     <!-- set jarfile only, if the target jar file has no generic name -->
     <attribute name="jarfile" default="${common-solr.dir}/build/contrib/solr-@{name}/apache-solr-@{name}-${version}.jar"/>
     <sequential>
-      <!--<echo message="Checking '@{jarfile}' against source folder '${common.dir}/contrib/@{name}/src/java'"/>-->
+      <!--<echo message="Checking '@{jarfile}' against source folder '${common.dir}/contrib/@{name}/src/main/java'"/>-->
       <property name="@{classpath.property}" location="@{jarfile}"/>
       <uptodate property="@{property}" targetfile="@{jarfile}">
         <srcfiles dir="../@{name}/src/main/java" includes="**/*.java"/>

Modified: lucene/dev/branches/solr2452/solr/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DataImportHandler.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DataImportHandler.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DataImportHandler.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DataImportHandler.java Sun May 22 21:45:19 2011
@@ -194,7 +194,7 @@ public class DataImportHandler extends R
               IMPORT_CMD.equals(command)) {
 
         UpdateRequestProcessorChain processorChain =
-                req.getCore().getUpdateProcessingChain(params.get(UpdateParams.UPDATE_PROCESSOR));
+                req.getCore().getUpdateProcessingChain(params.get(UpdateParams.UPDATE_CHAIN));
         UpdateRequestProcessor processor = processorChain.createProcessor(req, rsp);
         SolrResourceLoader loader = req.getCore().getResourceLoader();
         SolrWriter sw = getSolrWriter(processor, loader, requestParams, req);

Modified: lucene/dev/branches/solr2452/solr/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/extraction/CHANGES.txt?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/extraction/CHANGES.txt (original)
+++ lucene/dev/branches/solr2452/solr/contrib/extraction/CHANGES.txt Sun May 22 21:45:19 2011
@@ -22,7 +22,7 @@ to your Solr Home lib directory.  See ht
 
 Current Version: Tika 0.8 (released 11/07/2010)
 
-$Id:$
+$Id$
 
 ================== Release 4.0-dev ==================
 
@@ -30,7 +30,8 @@ $Id:$
 
 ================== Release 3.2-dev ==================
 
-(No Changes)
+* SOLR-2480: Add ignoreTikaException flag so that users can ignore TikaException but index
+  meta data. (Shinichiro Abe, koji)
 
 ================== Release 3.1-dev ==================
 

Modified: lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Sun May 22 21:45:19 2011
@@ -16,20 +16,27 @@
  */
 package org.apache.solr.handler.extraction;
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.Locale;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.params.UpdateParams;
 import org.apache.solr.common.util.ContentStream;
 import org.apache.solr.common.util.NamedList;
+import org.apache.solr.handler.ContentStreamLoader;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.update.AddUpdateCommand;
 import org.apache.solr.update.processor.UpdateRequestProcessor;
-import org.apache.solr.handler.ContentStreamLoader;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -37,26 +44,24 @@ import org.apache.tika.sax.XHTMLContentH
 import org.apache.tika.sax.xpath.Matcher;
 import org.apache.tika.sax.xpath.MatchingContentHandler;
 import org.apache.tika.sax.xpath.XPathParser;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.mime.MediaType;
-import org.apache.xml.serialize.OutputFormat;
 import org.apache.xml.serialize.BaseMarkupSerializer;
-import org.apache.xml.serialize.XMLSerializer;
+import org.apache.xml.serialize.OutputFormat;
 import org.apache.xml.serialize.TextSerializer;
+import org.apache.xml.serialize.XMLSerializer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.util.Locale;
-
 
 /**
  * The class responsible for loading extracted content into Solr.
  *
  **/
 public class ExtractingDocumentLoader extends ContentStreamLoader {
+
+  private static final Logger log = LoggerFactory.getLogger(ExtractingDocumentLoader.class);
+
   /**
    * Extract Only supported format
    */
@@ -74,6 +79,7 @@ public class ExtractingDocumentLoader ex
   final IndexSchema schema;
   final SolrParams params;
   final UpdateRequestProcessor processor;
+  final boolean ignoreTikaException;
   protected AutoDetectParser autoDetectParser;
 
   private final AddUpdateCommand templateAdd;
@@ -95,6 +101,8 @@ public class ExtractingDocumentLoader ex
     //this is lightweight
     autoDetectParser = new AutoDetectParser(config);
     this.factory = factory;
+    
+    ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
   }
 
 
@@ -180,9 +188,17 @@ public class ExtractingDocumentLoader ex
           parsingHandler = new MatchingContentHandler(handler, matcher);
         } //else leave it as is
 
-        //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
-        ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
-        parser.parse(inputStream, parsingHandler, metadata, context);
+        try{
+          //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
+          ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+          parser.parse(inputStream, parsingHandler, metadata, context);
+        } catch (TikaException e) {
+          if(ignoreTikaException)
+            log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage())
+                .append(". metadata=").append(metadata.toString()).toString());
+          else
+            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+        }
         if (extractOnly == false) {
           addDoc(handler);
         } else {
@@ -202,8 +218,6 @@ public class ExtractingDocumentLoader ex
         }
       } catch (SAXException e) {
         throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
-      } catch (TikaException e) {
-        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
       } finally {
         IOUtils.closeQuietly(inputStream);
       }

Modified: lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java Sun May 22 21:45:19 2011
@@ -28,6 +28,11 @@ public interface ExtractingParams {
    */
   public static final String LOWERNAMES = "lowernames";
 
+  /**
+   * If true, ignore TikaException (give up on extracting text, but still index the metadata).
+   */
+  public static final String IGNORE_TIKA_EXCEPTION = "ignoreTikaException";
+
 
   /**
    * The param prefix for mapping Tika metadata to Solr fields.

Modified: lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java Sun May 22 21:45:19 2011
@@ -47,23 +47,23 @@ import java.util.*;
  */
 public class SolrContentHandler extends DefaultHandler implements ExtractingParams {
   private transient static Logger log = LoggerFactory.getLogger(SolrContentHandler.class);
-  private SolrInputDocument document;
+  protected SolrInputDocument document;
 
-  private Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
+  protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
 
-  private Metadata metadata;
-  private SolrParams params;
-  private StringBuilder catchAllBuilder = new StringBuilder(2048);
-  private IndexSchema schema;
-  private Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
+  protected Metadata metadata;
+  protected SolrParams params;
+  protected StringBuilder catchAllBuilder = new StringBuilder(2048);
+  protected IndexSchema schema;
+  protected Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
   private LinkedList<StringBuilder> bldrStack = new LinkedList<StringBuilder>();
 
-  private boolean captureAttribs;
-  private boolean lowerNames;
-  private String contentFieldName = "content";
+  protected boolean captureAttribs;
+  protected boolean lowerNames;
+  protected String contentFieldName = "content";
 
-  private String unknownFieldPrefix = "";
-  private String defaultField = "";
+  protected String unknownFieldPrefix = "";
+  protected String defaultField = "";
 
   public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
     this(metadata, params, schema, DateUtil.DEFAULT_DATE_FORMATS);
@@ -99,46 +99,82 @@ public class SolrContentHandler extends 
    * The base implementation adds the metadata as fields, allowing for potential remapping.
    *
    * @return The {@link org.apache.solr.common.SolrInputDocument}.
+   *
+   * @see #addMetadata()
+   * @see #addCapturedContent()
+   * @see #addContent()
+   * @see #addLiterals()
    */
   public SolrInputDocument newDocument() {
     float boost = 1.0f;
     //handle the metadata extracted from the document
-    for (String name : metadata.names()) {
-      String[] vals = metadata.getValues(name);
-      addField(name, null, vals);
-    }
+    addMetadata();
 
     //handle the literals from the params
-    Iterator<String> paramNames = params.getParameterNamesIterator();
-    while (paramNames.hasNext()) {
-      String pname = paramNames.next();
-      if (!pname.startsWith(LITERALS_PREFIX)) continue;
-
-      String name = pname.substring(LITERALS_PREFIX.length());
-      addField(name, null, params.getParams(pname));
-    }
+    addLiterals();
 
 
     //add in the content
-    addField(contentFieldName, catchAllBuilder.toString(), null);
+    addContent();
 
     //add in the captured content
+    addCapturedContent();
+
+    if (log.isDebugEnabled()) {
+      log.debug("Doc: {}", document);
+    }
+    return document;
+  }
+
+  /**
+   * Add the per field captured content to the Solr Document.  Default implementation uses the
+   * {@link #fieldBuilders} info
+   */
+  protected void addCapturedContent() {
     for (Map.Entry<String, StringBuilder> entry : fieldBuilders.entrySet()) {
       if (entry.getValue().length() > 0) {
         addField(entry.getKey(), entry.getValue().toString(), null);
       }
     }
-    if (log.isDebugEnabled()) {
-      log.debug("Doc: " + document);
+  }
+
+  /**
+   * Add in the catch all content to the field.  Default impl. uses the {@link #contentFieldName}
+   * and the {@link #catchAllBuilder}
+   */
+  protected void addContent() {
+    addField(contentFieldName, catchAllBuilder.toString(), null);
+  }
+
+  /**
+   * Add in the literals to the document using the {@link #params} and the {@link #LITERALS_PREFIX}.
+   */
+  protected void addLiterals() {
+    Iterator<String> paramNames = params.getParameterNamesIterator();
+    while (paramNames.hasNext()) {
+      String pname = paramNames.next();
+      if (!pname.startsWith(LITERALS_PREFIX)) continue;
+
+      String name = pname.substring(LITERALS_PREFIX.length());
+      addField(name, null, params.getParams(pname));
+    }
+  }
+
+  /**
+   * Add in any metadata using {@link #metadata} as the source.
+   */
+  protected void addMetadata() {
+    for (String name : metadata.names()) {
+      String[] vals = metadata.getValues(name);
+      addField(name, null, vals);
     }
-    return document;
   }
 
   // Naming rules:
   // 1) optionally map names to nicenames (lowercase+underscores)
   // 2) execute "map" commands
   // 3) if resulting field is unknown, map it to a common prefix
-  private void addField(String fname, String fval, String[] vals) {
+  protected void addField(String fname, String fval, String[] vals) {
     if (lowerNames) {
       StringBuilder sb = new StringBuilder();
       for (int i=0; i<fname.length(); i++) {

Modified: lucene/dev/branches/solr2452/solr/contrib/uima/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/CHANGES.txt?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/CHANGES.txt (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/CHANGES.txt Sun May 22 21:45:19 2011
@@ -21,11 +21,33 @@ $Id$
 
 ==================  3.2.0-dev ==================
 
+Upgrading from Solr 3.1
+----------------------
+
+* <uimaConfig/> just beneath <config> ... </config> is no longer supported.
+  It should be moved into the UIMAUpdateRequestProcessorFactory settings.
+  See contrib/uima/README.txt for more details. (SOLR-2436)
+
+New Features
+----------------------
+
+* SOLR-2503: extend mapping function to map feature value to dynamicField. (koji)
+
+* SOLR-2512: add ignoreErrors flag so that users can ignore exceptions in AE.
+  (Tommaso Teofili, koji)
+
 Test Cases:
+----------------------
+
+* SOLR-2387: add mock annotators for improved testing,
+  (Tommaso Teofili via rmuir)
+
+Other Changes
+----------------------
 
- * SOLR-2387: add mock annotators for improved testing,
-   (Tommaso Teofili via rmuir)
+* SOLR-2436: move uimaConfig to under the uima's update processor in solrconfig.xml.
+  (Tommaso Teofili, koji)
 
-==================  3.1.0-dev ==================
+==================  3.1.0 ==================
 
 Initial Release

Modified: lucene/dev/branches/solr2452/solr/contrib/uima/README.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/README.txt?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/README.txt (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/README.txt Sun May 22 21:45:19 2011
@@ -3,38 +3,74 @@ Getting Started
 To start using Solr UIMA Metadata Extraction Library you should go through the following configuration steps:
 
 1. copy generated solr-uima jar and its libs (under contrib/uima/lib) inside a Solr libraries directory.
+   or set <lib/> tags in solrconfig.xml appropriately to point to those jar files.
+
+   <lib dir="../../contrib/uima/lib" />
+   <lib dir="../../dist/" regex="apache-solr-uima-\d.*\.jar" />
 
 2. modify your schema.xml adding the fields you want to hold metadata, specifying proper values for type, indexed, stored and multiValued options:
 
-3. for example you could specify the following
+   for example you could specify the following
+
   <field name="language" type="string" indexed="true" stored="true" required="false"/>
   <field name="concept" type="string" indexed="true" stored="true" multiValued="true" required="false"/>
   <field name="sentence" type="text" indexed="true" stored="true" multiValued="true" required="false" />
 
-4. modify your solrconfig.xml adding the following snippet:
-  <uimaConfig>
-    <runtimeParameters>
-      <keyword_apikey>VALID_ALCHEMYAPI_KEY</keyword_apikey>
-      <concept_apikey>VALID_ALCHEMYAPI_KEY</concept_apikey>
-      <lang_apikey>VALID_ALCHEMYAPI_KEY</lang_apikey>
-      <cat_apikey>VALID_ALCHEMYAPI_KEY</cat_apikey>
-      <entities_apikey>VALID_ALCHEMYAPI_KEY</entities_apikey>
-      <oc_licenseID>VALID_OPENCALAIS_KEY</oc_licenseID>
-    </runtimeParameters>
-    <analysisEngine>/org/apache/uima/desc/OverridingParamsExtServicesAE.xml</analysisEngine>
-    <analyzeFields merge="false">text</analyzeFields>
-    <fieldMapping>
-      <type name="org.apache.uima.alchemy.ts.concept.ConceptFS">
-       <map feature="text" field="concept"/>
-      </type>
-      <type name="org.apache.uima.alchemy.ts.language.LanguageFS">
-       <map feature="language" field="language"/>
-      </type>
-      <type name="org.apache.uima.SentenceAnnotation">
-        <map feature="coveredText" field="sentence"/>
-      </type>
-    </fieldMapping>
-  </uimaConfig>
+3. modify your solrconfig.xml adding the following snippet:
+
+  <updateRequestProcessorChain name="uima">
+    <processor class="org.apache.solr.uima.processor.UIMAUpdateRequestProcessorFactory">
+      <lst name="uimaConfig">
+        <lst name="runtimeParameters">
+          <str name="keyword_apikey">VALID_ALCHEMYAPI_KEY</str>
+          <str name="concept_apikey">VALID_ALCHEMYAPI_KEY</str>
+          <str name="lang_apikey">VALID_ALCHEMYAPI_KEY</str>
+          <str name="cat_apikey">VALID_ALCHEMYAPI_KEY</str>
+          <str name="entities_apikey">VALID_ALCHEMYAPI_KEY</str>
+          <str name="oc_licenseID">VALID_OPENCALAIS_KEY</str>
+        </lst>
+        <str name="analysisEngine">/org/apache/uima/desc/OverridingParamsExtServicesAE.xml</str>
+        <!-- Set to true if you want to continue indexing even if text processing fails.
+             Default is false. That is, Solr throws a RuntimeException and
+             none of the documents in your session are indexed. -->
+        <bool name="ignoreErrors">true</bool>
+        <!-- This is optional. It is used for logging when text processing fails.
+             Usually set to the uniqueKey field name. -->
+        <str name="logField">id</str>
+        <lst name="analyzeFields">
+          <bool name="merge">false</bool>
+          <arr name="fields">
+            <str>text</str>
+          </arr>
+        </lst>
+        <lst name="fieldMappings">
+          <lst name="type">
+            <str name="name">org.apache.uima.alchemy.ts.concept.ConceptFS</str>
+            <lst name="mapping">
+              <str name="feature">text</str>
+              <str name="field">concept</str>
+            </lst>
+          </lst>
+          <lst name="type">
+            <str name="name">org.apache.uima.alchemy.ts.language.LanguageFS</str>
+            <lst name="mapping">
+              <str name="feature">language</str>
+              <str name="field">language</str>
+            </lst>
+          </lst>
+          <lst name="type">
+            <str name="name">org.apache.uima.SentenceAnnotation</str>
+            <lst name="mapping">
+              <str name="feature">coveredText</str>
+              <str name="field">sentence</str>
+            </lst>
+          </lst>
+        </lst>
+      </lst>
+    </processor>
+    <processor class="solr.LogUpdateProcessorFactory" />
+    <processor class="solr.RunUpdateProcessorFactory" />
+  </updateRequestProcessorChain>
 
    where VALID_ALCHEMYAPI_KEY is your AlchemyAPI Access Key. You need to register AlchemyAPI Access
    key to exploit the AlchemyAPI services: http://www.alchemyapi.com/api/register.html
@@ -42,21 +78,14 @@ To start using Solr UIMA Metadata Extrac
    where VALID_OPENCALAIS_KEY is your Calais Service Key. You need to register Calais Service
    key to exploit the Calais services: http://www.opencalais.com/apikey
   
-5. the analysisEngine tag must contain an AE descriptor inside the specified path in the classpath
+   the analysisEngine must contain an AE descriptor inside the specified path in the classpath
 
-6. the analyzeFields tag must contain the input fields that need to be analyzed by UIMA,
+   the analyzeFields must contain the input fields that need to be analyzed by UIMA,
    if merge=true then their content will be merged and analyzed only once
 
-7. field mapping describes which features of which types should go in a field
-
-8. define in your solrconfig.xml an UpdateRequestProcessorChain as following:
-  <updateRequestProcessorChain name="uima">
-    <processor class="org.apache.solr.uima.processor.UIMAUpdateRequestProcessorFactory"/>
-    <processor class="solr.LogUpdateProcessorFactory" />
-    <processor class="solr.RunUpdateProcessorFactory" />
-  </updateRequestProcessorChain>
+   field mapping describes which features of which types should go in a field
 
-9. in your solrconfig.xml replace the existing default (<requestHandler name="/update"...)  or create a new UpdateRequestHandler with the following:
+4. in your solrconfig.xml replace the existing default (<requestHandler name="/update"...)  or create a new UpdateRequestHandler with the following:
   <requestHandler name="/update" class="solr.XmlUpdateRequestHandler">
     <lst name="defaults">
       <str name="update.processor">uima</str>

Modified: lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java Sun May 22 21:45:19 2011
@@ -21,7 +21,7 @@ import java.util.Map;
 
 /**
  * Configuration holding all the configurable parameters for calling UIMA inside Solr
- * 
+ *
  * @version $Id$
  */
 public class SolrUIMAConfiguration {
@@ -30,20 +30,26 @@ public class SolrUIMAConfiguration {
 
   private boolean fieldsMerging;
 
-  private Map<String, Map<String, String>> typesFeaturesFieldsMapping;
+  private Map<String, Map<String, MapField>> typesFeaturesFieldsMapping;
 
   private String aePath;
 
   private Map<String, Object> runtimeParameters;
 
+  private boolean ignoreErrors;
+  
+  private String logField;
+
   public SolrUIMAConfiguration(String aePath, String[] fieldsToAnalyze, boolean fieldsMerging,
-          Map<String, Map<String, String>> typesFeaturesFieldsMapping,
-          Map<String, Object> runtimeParameters) {
+          Map<String, Map<String, MapField>> typesFeaturesFieldsMapping,
+          Map<String, Object> runtimeParameters, boolean ignoreErrors, String logField) {
     this.aePath = aePath;
     this.fieldsToAnalyze = fieldsToAnalyze;
     this.fieldsMerging = fieldsMerging;
     this.runtimeParameters = runtimeParameters;
     this.typesFeaturesFieldsMapping = typesFeaturesFieldsMapping;
+    this.ignoreErrors = ignoreErrors;
+    this.logField = logField;
   }
 
   public String[] getFieldsToAnalyze() {
@@ -54,7 +60,7 @@ public class SolrUIMAConfiguration {
     return fieldsMerging;
   }
 
-  public Map<String, Map<String, String>> getTypesFeaturesFieldsMapping() {
+  public Map<String, Map<String, MapField>> getTypesFeaturesFieldsMapping() {
     return typesFeaturesFieldsMapping;
   }
 
@@ -66,4 +72,46 @@ public class SolrUIMAConfiguration {
     return runtimeParameters;
   }
 
+  public boolean isIgnoreErrors() {
+    return ignoreErrors;
+  }
+  
+  public String getLogField(){
+    return logField;
+  }
+  
+  static final class MapField {
+    
+    private String fieldName, fieldNameFeature;
+    private boolean prefix; // valid if dynamicField == true
+                            // false: *_s, true: s_*
+    
+    MapField(String fieldName, String fieldNameFeature){
+      this.fieldName = fieldName;
+      this.fieldNameFeature = fieldNameFeature;
+      if(fieldNameFeature != null){
+        if(fieldName.startsWith("*")){
+          prefix = false;
+          this.fieldName = fieldName.substring(1);
+        }
+        else if(fieldName.endsWith("*")){
+          prefix = true;
+          this.fieldName = fieldName.substring(0, fieldName.length() - 1);
+        }
+        else
+          throw new RuntimeException("static field name cannot be used for dynamicField");
+      }
+    }
+    
+    String getFieldNameFeature(){
+      return fieldNameFeature;
+    }
+    
+    String getFieldName(String featureValue){
+      if(fieldNameFeature != null){
+        return prefix ? fieldName + featureValue : featureValue + fieldName;
+      }
+      return fieldName;
+    }
+  }
 }

Modified: lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java Sun May 22 21:45:19 2011
@@ -18,11 +18,11 @@ package org.apache.solr.uima.processor;
  */
 
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 
-import org.apache.solr.core.SolrConfig;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.uima.processor.SolrUIMAConfiguration.MapField;
 
 /**
  * Read configuration for Solr-UIMA integration
@@ -32,94 +32,86 @@ import org.w3c.dom.NodeList;
  */
 public class SolrUIMAConfigurationReader {
 
-  private static final String AE_RUNTIME_PARAMETERS_NODE_PATH = "/config/uimaConfig/runtimeParameters";
+  private NamedList<Object> args;
 
-  private static final String FIELD_MAPPING_NODE_PATH = "/config/uimaConfig/fieldMapping";
-
-  private static final String ANALYZE_FIELDS_NODE_PATH = "/config/uimaConfig/analyzeFields";
-
-  private static final String ANALYSIS_ENGINE_NODE_PATH = "/config/uimaConfig/analysisEngine";
-
-  private SolrConfig solrConfig;
-
-  public SolrUIMAConfigurationReader(SolrConfig solrConfig) {
-    this.solrConfig = solrConfig;
+  public SolrUIMAConfigurationReader(NamedList<Object> args) {
+    this.args = args;
   }
 
   public SolrUIMAConfiguration readSolrUIMAConfiguration() {
     return new SolrUIMAConfiguration(readAEPath(), readFieldsToAnalyze(), readFieldsMerging(),
-            readTypesFeaturesFieldsMapping(), readAEOverridingParameters());
+            readTypesFeaturesFieldsMapping(), readAEOverridingParameters(), readIgnoreErrors(),
+            readLogField());
   }
 
   private String readAEPath() {
-    return solrConfig.getNode(ANALYSIS_ENGINE_NODE_PATH, true).getTextContent();
+    return (String) args.get("analysisEngine");
   }
 
+  @SuppressWarnings("rawtypes")
+  private NamedList getAnalyzeFields() { // the "analyzeFields" sub-list; callers read "fields" and "merge" from it
+    return (NamedList) args.get("analyzeFields");
+  }
+
+  @SuppressWarnings("unchecked")
   private String[] readFieldsToAnalyze() {
-    Node analyzeFieldsNode = solrConfig.getNode(ANALYZE_FIELDS_NODE_PATH, true);
-    return analyzeFieldsNode.getTextContent().split(",");
+    List<String> fields = (List<String>) getAnalyzeFields().get("fields");
+    return fields.toArray(new String[fields.size()]);
   }
 
   private boolean readFieldsMerging() {
-    Node analyzeFieldsNode = solrConfig.getNode(ANALYZE_FIELDS_NODE_PATH, true);
-    Node mergeNode = analyzeFieldsNode.getAttributes().getNamedItem("merge");
-    return Boolean.valueOf(mergeNode.getNodeValue());
+    return (Boolean) getAnalyzeFields().get("merge");
   }
 
-  private Map<String, Map<String, String>> readTypesFeaturesFieldsMapping() {
-    Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
+  @SuppressWarnings("rawtypes")
+  private Map<String, Map<String, MapField>> readTypesFeaturesFieldsMapping() { // result: UIMA type name -> (feature name -> MapField)
+    Map<String, Map<String, MapField>> map = new HashMap<String, Map<String, MapField>>();
 
-    Node fieldMappingNode = solrConfig.getNode(FIELD_MAPPING_NODE_PATH, true);
+    NamedList fieldMappings = (NamedList) args.get("fieldMappings");
     /* iterate over UIMA types */
-    if (fieldMappingNode.hasChildNodes()) {
-      NodeList typeNodes = fieldMappingNode.getChildNodes();
-      for (int i = 0; i < typeNodes.getLength(); i++) {
-        /* <type> node */
-        Node typeNode = typeNodes.item(i);
-        if (typeNode.getNodeType() != Node.TEXT_NODE) {
-          Node typeNameAttribute = typeNode.getAttributes().getNamedItem("name");
-          /* get a UIMA typename */
-          String typeName = typeNameAttribute.getNodeValue();
-          /* create entry for UIMA type */
-          map.put(typeName, new HashMap<String, String>());
-          if (typeNode.hasChildNodes()) {
-            /* iterate over features */
-            NodeList featuresNodeList = typeNode.getChildNodes();
-            for (int j = 0; j < featuresNodeList.getLength(); j++) {
-              Node mappingNode = featuresNodeList.item(j);
-              if (mappingNode.getNodeType() != Node.TEXT_NODE) {
-                /* get field name */
-                Node fieldNameNode = mappingNode.getAttributes().getNamedItem("field");
-                String mappedFieldName = fieldNameNode.getNodeValue();
-                /* get feature name */
-                Node featureNameNode = mappingNode.getAttributes().getNamedItem("feature");
-                String featureName = featureNameNode.getNodeValue();
-                /* map the feature to the field for the specified type */
-                map.get(typeName).put(featureName, mappedFieldName);
-              }
-            }
-          }
+    for (int i = 0; i < fieldMappings.size(); i++) { // NOTE(review): presumes every child of "fieldMappings" is a "type" list - confirm
+      NamedList type = (NamedList) fieldMappings.get("type", i);
+      String typeName = (String)type.get("name");
+
+      Map<String, MapField> subMap = new HashMap<String, MapField>();
+      /* iterate over mapping definitions */
+      for(int j = 0; j < type.size() - 1; j++){ // NOTE(review): size() - 1 presumes exactly one non-mapping entry ("name") - confirm
+        NamedList mapping = (NamedList) type.get("mapping", j + 1); // NOTE(review): start index j + 1 presumes "name" is the first entry - verify against NamedList.get(String, int)
+        String featureName = (String) mapping.get("feature");
+        String fieldNameFeature = null;
+        String mappedFieldName = (String) mapping.get("field");
+        if(mappedFieldName == null){ // no static "field": fall back to the dynamicField / fieldNameFeature pair
+          fieldNameFeature = (String) mapping.get("fieldNameFeature");
+          mappedFieldName = (String) mapping.get("dynamicField");
         }
+        if(mappedFieldName == null)
+          throw new RuntimeException("either of field or dynamicField should be defined for feature " + featureName);
+        MapField mapField = new MapField(mappedFieldName, fieldNameFeature);
+        subMap.put(featureName, mapField);
       }
+      map.put(typeName, subMap);
     }
     return map;
   }
 
+  @SuppressWarnings("rawtypes")
   private Map<String, Object> readAEOverridingParameters() {
     Map<String, Object> runtimeParameters = new HashMap<String, Object>();
-    Node uimaConfigNode = solrConfig.getNode(AE_RUNTIME_PARAMETERS_NODE_PATH, true);
-
-    if (uimaConfigNode.hasChildNodes()) {
-      NodeList overridingNodes = uimaConfigNode.getChildNodes();
-      for (int i = 0; i < overridingNodes.getLength(); i++) {
-        Node overridingNode = overridingNodes.item(i);
-        if (overridingNode.getNodeType() != Node.TEXT_NODE && overridingNode.getNodeType() != Node.COMMENT_NODE) {
-          runtimeParameters.put(overridingNode.getNodeName(), overridingNode.getTextContent());
-        }
-      }
+    NamedList runtimeParams = (NamedList) args.get("runtimeParameters");
+    for (int i = 0; i < runtimeParams.size(); i++) {
+      String name = runtimeParams.getName(i);
+      Object value = runtimeParams.getVal(i);
+      runtimeParameters.put(name, value);
     }
-
     return runtimeParameters;
   }
 
+  private boolean readIgnoreErrors() { // optional setting: false when "ignoreErrors" is not configured
+    Object ignoreErrors = args.get("ignoreErrors");
+    return ignoreErrors == null ? false : (Boolean)ignoreErrors;
+  }
+
+  private String readLogField() { // optional setting: null when "logField" is not configured
+    return (String)args.get("logField");
+  }
 }

Modified: lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAToSolrMapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAToSolrMapper.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAToSolrMapper.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAToSolrMapper.java Sun May 22 21:45:19 2011
@@ -20,6 +20,7 @@ package org.apache.solr.uima.processor;
 import java.util.Map;
 
 import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.uima.processor.SolrUIMAConfiguration.MapField;
 import org.apache.uima.cas.FSIterator;
 import org.apache.uima.cas.FeatureStructure;
 import org.apache.uima.cas.Type;
@@ -53,7 +54,7 @@ public class UIMAToSolrMapper {
    *          name of UIMA type to map
    * @param featureFieldsmapping
    */
-  public void map(String typeName, Map<String, String> featureFieldsmapping) {
+  public void map(String typeName, Map<String, MapField> featureFieldsmapping) {
     try {
       FeatureStructure fsMock = (FeatureStructure) Class.forName(typeName).getConstructor(
               JCas.class).newInstance(cas);
@@ -62,7 +63,11 @@ public class UIMAToSolrMapper {
               .hasNext();) {
         FeatureStructure fs = iterator.next();
         for (String featureName : featureFieldsmapping.keySet()) {
-          String fieldName = featureFieldsmapping.get(featureName);
+          MapField mapField = featureFieldsmapping.get(featureName);
+          String fieldNameFeature = mapField.getFieldNameFeature();
+          String fieldNameFeatureValue = fieldNameFeature == null ? null :
+            fs.getFeatureValueAsString(type.getFeatureByBaseName(fieldNameFeature));
+          String fieldName = mapField.getFieldName(fieldNameFeatureValue);
           log.info(new StringBuffer("mapping ").append(typeName).append("@").append(featureName)
                   .append(" to ").append(fieldName).toString());
           String featureValue = null;

Modified: lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java Sun May 22 21:45:19 2011
@@ -20,8 +20,11 @@ package org.apache.solr.uima.processor;
 import java.io.IOException;
 import java.util.Map;
 
+import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.core.SolrCore;
+import org.apache.solr.uima.processor.SolrUIMAConfiguration.MapField;
 import org.apache.solr.uima.processor.ae.AEProvider;
 import org.apache.solr.uima.processor.ae.AEProviderFactory;
 import org.apache.solr.update.AddUpdateCommand;
@@ -34,43 +37,45 @@ import org.apache.uima.resource.Resource
 
 /**
  * Update document(s) to be indexed with UIMA extracted information
- * 
+ *
  * @version $Id$
  */
 public class UIMAUpdateRequestProcessor extends UpdateRequestProcessor {
 
-  private SolrUIMAConfiguration solrUIMAConfiguration;
+  SolrUIMAConfiguration solrUIMAConfiguration;
 
   private AEProvider aeProvider;
 
-  public UIMAUpdateRequestProcessor(UpdateRequestProcessor next, SolrCore solrCore) {
+  public UIMAUpdateRequestProcessor(UpdateRequestProcessor next, SolrCore solrCore,
+      SolrUIMAConfiguration config) {
     super(next);
-    initialize(solrCore);
+    initialize(solrCore, config);
   }
 
-  private void initialize(SolrCore solrCore) {
-    SolrUIMAConfigurationReader uimaConfigurationReader = new SolrUIMAConfigurationReader(solrCore
-            .getSolrConfig());
-    solrUIMAConfiguration = uimaConfigurationReader.readSolrUIMAConfiguration();
+  private void initialize(SolrCore solrCore, SolrUIMAConfiguration config) {
+    solrUIMAConfiguration = config;
     aeProvider = AEProviderFactory.getInstance().getAEProvider(solrCore.getName(),
             solrUIMAConfiguration.getAePath(), solrUIMAConfiguration.getRuntimeParameters());
   }
 
   @Override
   public void processAdd(AddUpdateCommand cmd) throws IOException {
+    String text = null;
     try {
       /* get Solr document */
       SolrInputDocument solrInputDocument = cmd.getSolrInputDocument();
 
       /* get the fields to analyze */
-      for (String text : getTextsToAnalyze(solrInputDocument)) {
+      String[] texts = getTextsToAnalyze(solrInputDocument);
+      for (int i = 0; i < texts.length; i++) {
+        text = texts[i];
         if (text != null && !"".equals(text)) {
           /* process the text value */
           JCas jcas = processText(text);
 
           UIMAToSolrMapper uimaToSolrMapper = new UIMAToSolrMapper(solrInputDocument, jcas);
           /* get field mapping from config */
-          Map<String, Map<String, String>> typesAndFeaturesFieldsMap = solrUIMAConfiguration
+          Map<String, Map<String, MapField>> typesAndFeaturesFieldsMap = solrUIMAConfiguration
                   .getTypesFeaturesFieldsMapping();
           /* map type features on fields */
           for (String typeFQN : typesAndFeaturesFieldsMap.keySet()) {
@@ -79,7 +84,25 @@ public class UIMAUpdateRequestProcessor 
         }
       }
     } catch (UIMAException e) {
-      throw new RuntimeException(e);
+      /* build the diagnostic message defensively: it must never fail itself and mask the UIMA error */
+      String logField = solrUIMAConfiguration.getLogField();
+      /* getFieldValue() is appended as an Object, so a missing or non-String log field cannot throw here */
+      String optionalFieldInfo = logField == null ? "." :
+        new StringBuilder(". ").append(logField).append("=")
+        .append(cmd.getSolrInputDocument().getFieldValue(logField))
+        .append(", ").toString();
+      /* quote at most the first 100 chars; the text may be shorter than that (or not yet assigned) */
+      String textSnippet = text == null ? "" : text.substring(0, Math.min(text.length(), 100));
+      if (solrUIMAConfiguration.isIgnoreErrors())
+        log.warn(new StringBuilder("skip the text processing due to ")
+          .append(e.getLocalizedMessage()).append(optionalFieldInfo)
+          .append(" text=\"").append(textSnippet).append("...\"").toString());
+      else{
+        throw new SolrException(ErrorCode.SERVER_ERROR,
+            new StringBuilder("processing error: ")
+              .append(e.getLocalizedMessage()).append(optionalFieldInfo)
+              .append(" text=\"").append(textSnippet).append("...\"").toString(), e);
+      }
     }
     super.processAdd(cmd);
   }

Modified: lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorFactory.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorFactory.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorFactory.java Sun May 22 21:45:19 2011
@@ -17,6 +17,7 @@ package org.apache.solr.uima.processor;
  * limitations under the License.
  */
 
+import org.apache.solr.common.util.NamedList;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.update.processor.UpdateRequestProcessor;
@@ -29,10 +30,19 @@ import org.apache.solr.update.processor.
  */
 public class UIMAUpdateRequestProcessorFactory extends UpdateRequestProcessorFactory {
 
+  private NamedList<Object> args;
+
+  @SuppressWarnings("unchecked")
+  @Override
+  public void init(@SuppressWarnings("rawtypes") NamedList args) { // keeps only the "uimaConfig" sub-list of the init args
+    this.args = (NamedList<Object>) args.get("uimaConfig");
+  }
+
   @Override
   public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
           UpdateRequestProcessor next) {
-    return new UIMAUpdateRequestProcessor(next, req.getCore());
+    return new UIMAUpdateRequestProcessor(next, req.getCore(),
+            new SolrUIMAConfigurationReader(args).readSolrUIMAConfiguration());
   }
 
 }

Modified: lucene/dev/branches/solr2452/solr/contrib/uima/src/main/resources/solr/conf/aggregate-uima-config.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/src/main/resources/solr/conf/aggregate-uima-config.xml?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/src/main/resources/solr/conf/aggregate-uima-config.xml (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/src/main/resources/solr/conf/aggregate-uima-config.xml Sun May 22 21:45:19 2011
@@ -15,19 +15,37 @@
     limitations under the License.
   -->
 
-<uimaConfig>
-  <runtimeParameters>
-    <keyword_apikey>VALID_ALCHEMYAPI_KEY</keyword_apikey>
-    <concept_apikey>VALID_ALCHEMYAPI_KEY</concept_apikey>
-    <lang_apikey>VALID_ALCHEMYAPI_KEY</lang_apikey>
-    <cat_apikey>VALID_ALCHEMYAPI_KEY</cat_apikey>
-    <oc_licenseID>VALID_OPENCALAIS_KEY</oc_licenseID>
-  </runtimeParameters>
-  <analysisEngine>/org/apache/uima/desc/OverridingParamsExtServicesAE.xml</analysisEngine>
-  <analyzeFields merge="false">text,title</analyzeFields>
-  <fieldMapping>
-    <type name="org.apache.uima.jcas.tcas.Annotation">
-      <map feature="coveredText" field="tag"/>
-    </type>
-  </fieldMapping>
-</uimaConfig>
\ No newline at end of file
+  <updateRequestProcessorChain name="uima">
+    <processor class="org.apache.solr.uima.processor.UIMAUpdateRequestProcessorFactory">
+      <lst name="uimaConfig">
+        <lst name="runtimeParameters">
+          <str name="keyword_apikey">VALID_ALCHEMYAPI_KEY</str>
+          <str name="concept_apikey">VALID_ALCHEMYAPI_KEY</str>
+          <str name="lang_apikey">VALID_ALCHEMYAPI_KEY</str>
+          <str name="cat_apikey">VALID_ALCHEMYAPI_KEY</str>
+          <str name="entities_apikey">VALID_ALCHEMYAPI_KEY</str>
+          <str name="oc_licenseID">VALID_OPENCALAIS_KEY</str>
+        </lst>
+        <str name="analysisEngine">/org/apache/uima/desc/OverridingParamsExtServicesAE.xml</str>
+        <lst name="analyzeFields">
+          <bool name="merge">false</bool>
+          <arr name="fields">
+            <str>text</str>
+            <str>title</str>
+          </arr>
+        </lst>
+        <!-- one "type" list per UIMA type: its "name" first, then one "mapping" list per feature -->
+        <lst name="fieldMappings">
+          <lst name="type">
+            <str name="name">org.apache.uima.jcas.tcas.Annotation</str>
+            <lst name="mapping">
+              <str name="feature">coveredText</str>
+              <str name="field">tag</str>
+            </lst>
+          </lst>
+        </lst>
+      </lst>
+    </processor>
+    <processor class="solr.LogUpdateProcessorFactory" />
+    <processor class="solr.RunUpdateProcessorFactory" />
+  </updateRequestProcessorChain>

Modified: lucene/dev/branches/solr2452/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java Sun May 22 21:45:19 2011
@@ -33,6 +33,8 @@ import org.apache.solr.core.SolrCore;
 import org.apache.solr.handler.XmlUpdateRequestHandler;
 import org.apache.solr.request.SolrQueryRequestBase;
 import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.uima.processor.SolrUIMAConfiguration.MapField;
+import org.apache.solr.update.processor.UpdateRequestProcessor;
 import org.apache.solr.update.processor.UpdateRequestProcessorChain;
 import org.junit.Before;
 import org.junit.BeforeClass;
@@ -66,12 +68,32 @@ public class UIMAUpdateRequestProcessorT
     UIMAUpdateRequestProcessorFactory factory = (UIMAUpdateRequestProcessorFactory) chained
             .getFactories()[0];
     assertNotNull(factory);
+    UpdateRequestProcessor processor = factory.getInstance(req(), null, null);
+    assertTrue(processor instanceof UIMAUpdateRequestProcessor);
+  }
+
+  @Test
+  public void testMultiMap() {
+    SolrCore core = h.getCore();
+    UpdateRequestProcessorChain chained = core.getUpdateProcessingChain("uima-multi-map");
+    assertNotNull(chained);
+    UIMAUpdateRequestProcessorFactory factory = (UIMAUpdateRequestProcessorFactory) chained
+            .getFactories()[0];
+    assertNotNull(factory);
+    UpdateRequestProcessor processor = factory.getInstance(req(), null, null);
+    assertTrue(processor instanceof UIMAUpdateRequestProcessor);
+    SolrUIMAConfiguration conf = ((UIMAUpdateRequestProcessor)processor).solrUIMAConfiguration;
+    Map<String, Map<String, MapField>> map = conf.getTypesFeaturesFieldsMapping();
+    Map<String, MapField> subMap = map.get("a-type-which-can-have-multiple-features");
+    assertEquals(2, subMap.size());
+    assertEquals("1", subMap.get("A").getFieldName(null));
+    assertEquals("2", subMap.get("B").getFieldName(null));
   }
 
   @Test
   public void testProcessing() throws Exception {
 
-    addDoc(adoc(
+    addDoc("uima", adoc(
             "id",
             "2312312321312",
             "text",
@@ -83,19 +105,19 @@ public class UIMAUpdateRequestProcessorT
     assertU(commit());
     assertQ(req("sentence:*"), "//*[@numFound='1']");
     assertQ(req("sentiment:*"), "//*[@numFound='0']");
-    assertQ(req("entity:Prague"), "//*[@numFound='1']");
+    assertQ(req("OTHER_sm:Prague"), "//*[@numFound='1']");
   }
 
   @Test
   public void testTwoUpdates() throws Exception {
 
-    addDoc(adoc("id", "1", "text", "The Apache Software Foundation is happy to announce "
+    addDoc("uima", adoc("id", "1", "text", "The Apache Software Foundation is happy to announce "
             + "BarCampApache Sydney, Australia, the first ASF-backed event in the Southern "
             + "Hemisphere!"));
     assertU(commit());
     assertQ(req("sentence:*"), "//*[@numFound='1']");
 
-    addDoc(adoc("id", "2", "text", "Taking place 11th December 2010 at the University "
+    addDoc("uima", adoc("id", "2", "text", "Taking place 11th December 2010 at the University "
             + "of Sydney's Darlington Centre, the BarCampApache \"unconference\" will be"
             + " attendee-driven, facilitated by members of the Apache community and will "
             + "focus on the Apache..."));
@@ -103,12 +125,44 @@ public class UIMAUpdateRequestProcessorT
     assertQ(req("sentence:*"), "//*[@numFound='2']");
 
     assertQ(req("sentiment:positive"), "//*[@numFound='1']");
-    assertQ(req("entity:Apache"), "//*[@numFound='2']");
+    assertQ(req("ORGANIZATION_sm:Apache"), "//*[@numFound='2']");
+  }
+
+  @Test
+  public void testErrorHandling() throws Exception {
+
+    try{
+      addDoc("uima-not-ignoreErrors", adoc(
+            "id",
+            "2312312321312",
+            "text",
+            "SpellCheckComponent got improvement related to recent Lucene changes. \n  "
+                    + "Add support for specifying Spelling SuggestWord Comparator to Lucene spell "
+                    + "checkers for SpellCheckComponent. Issue SOLR-2053 is already fixed, patch is"
+                    + " attached if you need it, but it is also committed to trunk and 3_x branch."
+                    + " Last Lucene European Conference has been held in Prague."));
+      fail("exception shouldn't be ignored");
+    }
+    catch(RuntimeException expected){}
+    assertU(commit());
+    assertQ(req("*:*"), "//*[@numFound='0']");
+
+    addDoc("uima-ignoreErrors", adoc(
+            "id",
+            "2312312321312",
+            "text",
+            "SpellCheckComponent got improvement related to recent Lucene changes. \n  "
+                    + "Add support for specifying Spelling SuggestWord Comparator to Lucene spell "
+                    + "checkers for SpellCheckComponent. Issue SOLR-2053 is already fixed, patch is"
+                    + " attached if you need it, but it is also committed to trunk and 3_x branch."
+                    + " Last Lucene European Conference has been held in Prague."));
+    assertU(commit());
+    assertQ(req("*:*"), "//*[@numFound='1']");
   }
 
-  private void addDoc(String doc) throws Exception {
+  private void addDoc(String chain, String doc) throws Exception {
     Map<String, String[]> params = new HashMap<String, String[]>();
-    params.put(UpdateParams.UPDATE_PROCESSOR, new String[] { "uima" });
+    params.put(UpdateParams.UPDATE_CHAIN, new String[] { chain });
     MultiMapSolrParams mmparams = new MultiMapSolrParams(params);
     SolrQueryRequestBase req = new SolrQueryRequestBase(h.getCore(), (SolrParams) mmparams) {
     };

Modified: lucene/dev/branches/solr2452/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/an/DummyEntityAnnotator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/an/DummyEntityAnnotator.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/an/DummyEntityAnnotator.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/an/DummyEntityAnnotator.java Sun May 22 21:45:19 2011
@@ -34,6 +34,12 @@ public class DummyEntityAnnotator extend
         EntityAnnotation entityAnnotation = new EntityAnnotation(jcas);
         entityAnnotation.setBegin(annotation.getBegin());
         entityAnnotation.setEnd(annotation.getEnd());
+        String entityString = annotation.getCoveredText();
+        entityAnnotation.setEntity(entityString);
+        String name = "OTHER"; // "OTHER" makes no sense. In practice, "PERSON", "COUNTRY", "E-MAIL", etc.
+        if(entityString.equals("Apache"))
+          name = "ORGANIZATION";
+        entityAnnotation.setName(name);
         entityAnnotation.addToIndexes();
       }
     }