You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2011/05/18 18:24:34 UTC
svn commit: r1124321 [5/5] - in /lucene/dev/branches/docvalues: ./ dev-tools/eclipse/ dev-tools/idea/ dev-tools/idea/.idea/ dev-tools/idea/lucene/ dev-tools/idea/lucene/contrib/ant/ dev-tools/idea/lucene/contrib/db/bdb-je/ dev-tools/idea/lucene/contrib...

Modified: lucene/dev/branches/docvalues/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/branches/docvalues/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Wed May 18 16:24:27 2011
@@ -18,9 +18,11 @@ package org.apache.solr.handler.clusteri
  */
 
 import java.io.IOException;
+import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -37,6 +39,7 @@ import org.apache.solr.common.params.Sol
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.SimpleOrderedMap;
 import org.apache.solr.core.SolrCore;
+import org.apache.solr.core.SolrResourceLoader;
 import org.apache.solr.handler.clustering.SearchClusteringEngine;
 import org.apache.solr.handler.component.HighlightComponent;
 import org.apache.solr.highlight.SolrHighlighter;
@@ -52,9 +55,17 @@ import org.carrot2.core.ControllerFactor
 import org.carrot2.core.Document;
 import org.carrot2.core.IClusteringAlgorithm;
 import org.carrot2.core.attribute.AttributeNames;
+import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
+import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
+import org.carrot2.util.resource.ClassLoaderLocator;
+import org.carrot2.util.resource.IResource;
+import org.carrot2.util.resource.IResourceLocator;
+import org.carrot2.util.resource.ResourceLookup;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
 
 /**
@@ -64,19 +75,33 @@ import com.google.common.collect.Sets;
  *
  * @link http://project.carrot2.org
  */
-@SuppressWarnings("unchecked")
 public class CarrotClusteringEngine extends SearchClusteringEngine {
-  private transient static Logger log = LoggerFactory
+	private transient static Logger log = LoggerFactory
           .getLogger(CarrotClusteringEngine.class);
 
+	/**
+	 * The subdirectory in Solr config dir to read customized Carrot2 resources from.
+	 */
+	private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
+
+  /**
+   * Name of Carrot2 document's field containing Solr document's identifier.
+   */
+  private static final String SOLR_DOCUMENT_ID = "solrId";
+
+  /**
+   * Name of Solr document's field containing the document's identifier. To avoid
+   * repeating the content of documents in clusters on output, each cluster contains
+   * identifiers of documents it contains.
+   */
+  private String idFieldName;
+
   /**
    * Carrot2 controller that manages instances of clustering algorithms
    */
   private Controller controller = ControllerFactory.createPooling();
   private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
 
-  private String idFieldName;
-
   @Override
   @Deprecated
   public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) {
@@ -101,6 +126,10 @@ public class CarrotClusteringEngine exte
       attributes.put(AttributeNames.DOCUMENTS, documents);
       attributes.put(AttributeNames.QUERY, query.toString());
 
+      // Pass the fields on which clustering runs to the
+      // SolrStopwordsCarrot2LexicalDataFactory
+      attributes.put("solrFieldNames", getFieldsForClustering(sreq));
+
       // Pass extra overriding attributes from the request, if any
       extractCarrotAttributes(sreq.getParams(), attributes);
 
@@ -113,22 +142,68 @@ public class CarrotClusteringEngine exte
     }
   }
 
-  @Override
+	@Override
+	@SuppressWarnings({ "unchecked", "rawtypes" })
   public String init(NamedList config, final SolrCore core) {
     String result = super.init(config, core);
-    SolrParams initParams = SolrParams.toSolrParams(config);
+    final SolrParams initParams = SolrParams.toSolrParams(config);
 
     // Initialize Carrot2 controller. Pass initialization attributes, if any.
     HashMap<String, Object> initAttributes = new HashMap<String, Object>();
     extractCarrotAttributes(initParams, initAttributes);
-    
-    // Customize the language model factory. The implementation we provide here
-    // is included in the code base of Solr, so that it's possible to refactor
-    // the Lucene APIs the factory relies on if needed.
-    initAttributes.put("PreprocessingPipeline.languageModelFactory",
-      LuceneLanguageModelFactory.class);
-    this.controller.init(initAttributes);
 
+    // Customize the stemmer and tokenizer factories. The implementations we provide here
+    // are included in the code base of Solr, so that it's possible to refactor
+    // the Lucene APIs the factories rely on if needed.
+    // Additionally, we set a custom lexical resource factory for Carrot2 that
+    // will use both Carrot2 default stop words as well as stop words from
+    // the StopFilter defined on the field.
+		BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
+				.stemmerFactory(LuceneCarrot2StemmerFactory.class)
+				.tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
+				.lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
+
+		// Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
+		initAttributes.put("solrIndexSchema", core.getSchema());
+
+    // Customize Carrot2's resource lookup to first look for resources
+    // using Solr's resource loader. If that fails, try loading from the classpath.
+    DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes)
+        .resourceLookup(new ResourceLookup(new IResourceLocator() {
+          @Override
+          public IResource[] getAll(final String resource) {
+            final SolrResourceLoader resourceLoader = core.getResourceLoader();
+            final String carrot2ResourcesDir = resourceLoader.getConfigDir()
+                + initParams.get(CarrotParams.LEXICAL_RESOURCES_DIR, CARROT_RESOURCES_PREFIX);
+            try {
+              log.debug("Looking for " + resource + " in "
+                  + carrot2ResourcesDir);
+              final InputStream resourceStream = resourceLoader
+                  .openResource(carrot2ResourcesDir + "/" + resource);
+
+              log.info(resource + " loaded from " + carrot2ResourcesDir);
+              final IResource foundResource = new IResource() {
+                @Override
+                public InputStream open() throws IOException {
+                  return resourceStream;
+                }
+              };
+              return new IResource[] { foundResource };
+            } catch (RuntimeException e) {
+              // No way to distinguish if the resource was found but failed
+              // to load or wasn't found at all, so we simply fall back
+              // to Carrot2 defaults here by returning an empty locations array.
+              log.debug(resource + " not found in " + carrot2ResourcesDir
+                  + ". Using the default " + resource + " from Carrot JAR.");
+              return new IResource[] {};
+            }
+          }
+        },
+
+        // Using the class loader directly because this time we want to omit the prefix
+        new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));
+
+    this.controller.init(initAttributes);
     this.idFieldName = core.getSchema().getUniqueKeyField().getName();
 
     // Make sure the requested Carrot2 clustering algorithm class is available
@@ -148,17 +223,29 @@ public class CarrotClusteringEngine exte
   protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
     SolrParams solrParams = sreq.getParams();
 
-    // Names of fields to deliver content for clustering
-    String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
+    HashSet<String> fields = Sets.newHashSet(getFieldsForClustering(sreq));
+    fields.add(idFieldName);
+    fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
+		return fields;
+  }
+
+	/**
+	 * Returns the names of fields that will be delivering the actual
+	 * content for clustering. Currently, there are two such fields: document
+	 * title and document content.
+	 */
+	private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
+    SolrParams solrParams = sreq.getParams();
+
     String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
     String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
     if (StringUtils.isBlank(snippetField)) {
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
               + " must not be blank.");
     }
-    return Sets.newHashSet(urlField, titleField, snippetField, idFieldName);
-  }
-  
+    return Sets.newHashSet(titleField, snippetField);
+	}
+
   /**
    * Prepares Carrot2 documents for clustering.
    */
@@ -180,7 +267,7 @@ public class CarrotClusteringEngine exte
     if (produceSummary == true) {
       highlighter = HighlightComponent.getHighlighter(core);
       if (highlighter != null){
-        Map args = new HashMap();
+        Map<String, Object> args = Maps.newHashMap();
         snippetFieldAry = new String[]{snippetField};
         args.put(HighlightParams.FIELDS, snippetFieldAry);
         args.put(HighlightParams.HIGHLIGHT, "true");
@@ -214,11 +301,12 @@ public class CarrotClusteringEngine exte
       if (produceSummary && docIds != null) {
         docsHolder[0] = docIds.get(sdoc).intValue();
         DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
-        NamedList highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
+        NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
         if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
           //should only be one document with one field
-          NamedList tmp = (NamedList) highlights.getVal(0);
-          String [] highlt = (String[]) tmp.get(snippetField);
+          @SuppressWarnings("unchecked")
+					NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
+          String [] highlt = tmp.get(snippetField);
           if (highlt != null && highlt.length == 1) {
             snippet = highlt[0];
           }
@@ -226,27 +314,13 @@ public class CarrotClusteringEngine exte
       }
       Document carrotDocument = new Document(getValue(sdoc, titleField),
               snippet, (String)sdoc.getFieldValue(urlField));
-      carrotDocument.setField("solrId", sdoc.getFieldValue(idFieldName));
+      carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
       result.add(carrotDocument);
     }
 
     return result;
   }
 
-  @Deprecated
-  protected String getValue(org.apache.lucene.document.Document doc,
-                            String field) {
-    StringBuilder result = new StringBuilder();
-    String[] vals = doc.getValues(field);
-    for (int i = 0; i < vals.length; i++) {
-      // Join multiple values with a period so that Carrot2 does not pick up
-      // phrases that cross field value boundaries (in most cases it would
-      // create useless phrases).
-      result.append(vals[i]).append(" . ");
-    }
-    return result.toString().trim();
-  }
-
   protected String getValue(SolrDocument sdoc, String field) {
     StringBuilder result = new StringBuilder();
     Collection<Object> vals = sdoc.getFieldValues(field);
@@ -261,9 +335,9 @@ public class CarrotClusteringEngine exte
     return result.toString().trim();
   }
 
-  private List clustersToNamedList(List<Cluster> carrotClusters,
+  private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
                                    SolrParams solrParams) {
-    List result = new ArrayList();
+    List<NamedList<Object>> result = Lists.newArrayList();
     clustersToNamedList(carrotClusters, result, solrParams.getBool(
             CarrotParams.OUTPUT_SUB_CLUSTERS, true), solrParams.getInt(
             CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
@@ -271,25 +345,40 @@ public class CarrotClusteringEngine exte
   }
 
   private void clustersToNamedList(List<Cluster> outputClusters,
-                                   List parent, boolean outputSubClusters, int maxLabels) {
+                                   List<NamedList<Object>> parent, boolean outputSubClusters, int maxLabels) {
     for (Cluster outCluster : outputClusters) {
-      NamedList cluster = new SimpleOrderedMap();
+      NamedList<Object> cluster = new SimpleOrderedMap<Object>();
       parent.add(cluster);
 
+      // Add labels
       List<String> labels = outCluster.getPhrases();
-      if (labels.size() > maxLabels)
+      if (labels.size() > maxLabels) {
         labels = labels.subList(0, maxLabels);
+      }
       cluster.add("labels", labels);
 
+      // Add cluster score
+      final Double score = outCluster.getScore();
+      if (score != null) {
+        cluster.add("score", score);
+      }
+
+      // Add other topics marker
+      if (outCluster.isOtherTopics()) {
+        cluster.add("other-topics", outCluster.isOtherTopics());
+      }
+
+      // Add documents
       List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
-      List docList = new ArrayList();
+      List<Object> docList = Lists.newArrayList();
       cluster.add("docs", docList);
       for (Document doc : docs) {
-        docList.add(doc.getField("solrId"));
+        docList.add(doc.getField(SOLR_DOCUMENT_ID));
       }
 
-      if (outputSubClusters) {
-        List subclusters = new ArrayList();
+      // Add subclusters
+      if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) {
+        List<NamedList<Object>> subclusters = Lists.newArrayList();
         cluster.add("clusters", subclusters);
         clustersToNamedList(outCluster.getSubclusters(), subclusters,
                 outputSubClusters, maxLabels);

Modified: lucene/dev/branches/docvalues/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java (original)
+++ lucene/dev/branches/docvalues/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java Wed May 18 16:24:27 2011
@@ -35,6 +35,8 @@ public interface CarrotParams {
   String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
   String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragzise";
 
+  String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
+
   public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
           ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
           PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS, SUMMARY_FRAGSIZE);

Modified: lucene/dev/branches/docvalues/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/branches/docvalues/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Wed May 18 16:24:27 2011
@@ -17,6 +17,11 @@ package org.apache.solr.handler.clusteri
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
@@ -37,15 +42,11 @@ import org.apache.solr.util.SolrPluginUt
 import org.carrot2.util.attribute.AttributeUtils;
 import org.junit.Test;
 
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import com.google.common.collect.ImmutableList;
 
 /**
  *
  */
-@SuppressWarnings("unchecked")
 public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
   @Test
   public void testCarrotLingo() throws Exception {
@@ -74,7 +75,7 @@ public class CarrotClusteringEngineTest 
 
   @Test
   public void testWithoutSubclusters() throws Exception {
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs),
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs),
             1, 1, 0);
   }
 
@@ -82,7 +83,7 @@ public class CarrotClusteringEngineTest 
   public void testWithSubclusters() throws Exception {
     ModifiableSolrParams params = new ModifiableSolrParams();
     params.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs), 1, 1, 2);
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs), 1, 1, 2);
   }
 
   @Test
@@ -90,19 +91,107 @@ public class CarrotClusteringEngineTest 
     ModifiableSolrParams params = new ModifiableSolrParams();
     params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 5);
     params.set(CarrotParams.NUM_DESCRIPTIONS, 3);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
             params), 1, 3, 0);
   }
 
   @Test
+  public void testClusterScores() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+    List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+        AbstractClusteringTestCase.numberOfDocs, params);
+    int i = 1;
+    for (NamedList<Object> cluster : clusters) {
+      final Double score = getScore(cluster);
+      assertNotNull(score);
+      assertEquals(0.25 * i++, score, 0);
+    }
+  }
+
+  @Test
+  public void testOtherTopics() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "otherTopicsModulo"), 2);
+    List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+        AbstractClusteringTestCase.numberOfDocs, params);
+    int i = 1;
+    for (NamedList<Object> cluster : clusters) {
+      assertEquals(i++ % 2 == 0 ? true : null, isOtherTopics(cluster));
+    }
+  }
+
+  @Test
   public void testCarrotAttributePassing() throws Exception {
     ModifiableSolrParams params = new ModifiableSolrParams();
     params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
     params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 3);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
             params), 1, 3, 0);
   }
 
+	@Test
+	public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
+		checkLexicalResourcesFromSolrConfig("lexical-resource-check",
+				"online,customsolrstopword,customsolrstoplabel");
+	}
+
+	@Test
+	public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
+		checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
+				"online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
+	}
+
+	private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
+			throws IOException {
+		ModifiableSolrParams params = new ModifiableSolrParams();
+		params.set("merge-resources", false);
+		params.set(AttributeUtils.getKey(
+				LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+				wordsToCheck);
+
+		// "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
+		// stoplabels.en, so we're expecting only one cluster with label "online".
+		final List<NamedList<Object>> clusters = checkEngine(
+				getClusteringEngine(engineName), 1, params);
+		assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+	}
+
+	@Test
+	public void solrStopWordsUsedInCarrot2Clustering() throws Exception {
+		ModifiableSolrParams params = new ModifiableSolrParams();
+		params.set("merge-resources", false);
+		params.set(AttributeUtils.getKey(
+				LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+		"online,solrownstopword");
+
+		// "solrownstopword" is in stopwords.txt, so we're expecting
+		// only one cluster with label "online".
+		final List<NamedList<Object>> clusters = checkEngine(
+				getClusteringEngine("lexical-resource-check"), 1, params);
+		assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+	}
+
+	@Test
+	public void solrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
+		ModifiableSolrParams params = new ModifiableSolrParams();
+		// Force string fields to be used for clustering. Does not make sense
+		// in a real word, but does the job in the test.
+		params.set(CarrotParams.TITLE_FIELD_NAME, "url");
+		params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
+		params.set("merge-resources", false);
+		params.set(AttributeUtils.getKey(
+				LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+		"online,solrownstopword");
+
+		final List<NamedList<Object>> clusters = checkEngine(
+				getClusteringEngine("lexical-resource-check"), 2, params);
+		assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
+		assertEquals(ImmutableList.of("solrownstopword"),
+				getLabels(clusters.get(1)));
+	}
+
   private CarrotClusteringEngine getClusteringEngine(String engineName) {
     ClusteringComponent comp = (ClusteringComponent) h.getCore()
             .getSearchComponent("clustering");
@@ -114,18 +203,18 @@ public class CarrotClusteringEngineTest 
     return engine;
   }
 
-  private List checkEngine(CarrotClusteringEngine engine,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
                             int expectedNumClusters) throws IOException {
     return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), new ModifiableSolrParams());
   }
 
-  private List checkEngine(CarrotClusteringEngine engine,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
                             int expectedNumClusters, SolrParams clusteringParams) throws IOException {
     return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), clusteringParams);
   }
 
 
-  private List checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
                            int expectedNumClusters, Query query, SolrParams clusteringParams) throws IOException {
     // Get all documents to cluster
     RefCounted<SolrIndexSearcher> ref = h.getCore().getSearcher();
@@ -145,7 +234,9 @@ public class CarrotClusteringEngineTest 
       LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
       Map<SolrDocument,Integer> docIds = new HashMap<SolrDocument, Integer>(docList.size());
       SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, engine.getFieldsToLoad(req), docIds );
-      List results = (List)engine.cluster(query, solrDocList, docIds, req);
+
+      @SuppressWarnings("unchecked")
+			List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
       req.close();
       assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
       checkClusters(results, false);
@@ -155,51 +246,74 @@ public class CarrotClusteringEngineTest 
     }
   }
 
-  private void checkClusters(List results, int expectedDocCount,
+  private void checkClusters(List<NamedList<Object>> results, int expectedDocCount,
                              int expectedLabelCount, int expectedSubclusterCount) {
     for (int i = 0; i < results.size(); i++) {
-      NamedList cluster = (NamedList) results.get(i);
+      NamedList<Object> cluster = results.get(i);
       checkCluster(cluster, expectedDocCount, expectedLabelCount,
               expectedSubclusterCount);
     }
   }
 
-  private void checkClusters(List results, boolean hasSubclusters) {
+  private void checkClusters(List<NamedList<Object>> results, boolean hasSubclusters) {
     for (int i = 0; i < results.size(); i++) {
-      checkCluster((NamedList) results.get(i), hasSubclusters);
+      checkCluster(results.get(i), hasSubclusters);
     }
   }
 
-  private void checkCluster(NamedList cluster, boolean hasSubclusters) {
-    List docs = (List) cluster.get("docs");
+  private void checkCluster(NamedList<Object> cluster, boolean hasSubclusters) {
+    List<Object> docs = getDocs(cluster);
     assertNotNull("docs is null and it shouldn't be", docs);
     for (int j = 0; j < docs.size(); j++) {
       String id = (String) docs.get(j);
       assertNotNull("id is null and it shouldn't be", id);
     }
 
-    List labels = (List) cluster.get("labels");
+    List<String> labels = getLabels(cluster);
     assertNotNull("labels is null but it shouldn't be", labels);
 
     if (hasSubclusters) {
-      List subclusters = (List) cluster.get("clusters");
+      List<NamedList<Object>> subclusters = getSubclusters(cluster);
       assertNotNull("subclusters is null but it shouldn't be", subclusters);
     }
   }
 
-  private void checkCluster(NamedList cluster, int expectedDocCount,
+  private void checkCluster(NamedList<Object> cluster, int expectedDocCount,
                             int expectedLabelCount, int expectedSubclusterCount) {
     checkCluster(cluster, expectedSubclusterCount > 0);
     assertEquals("number of docs in cluster", expectedDocCount,
-            ((List) cluster.get("docs")).size());
+            getDocs(cluster).size());
     assertEquals("number of labels in cluster", expectedLabelCount,
-            ((List) cluster.get("labels")).size());
+            getLabels(cluster).size());
 
     if (expectedSubclusterCount > 0) {
-      List subclusters = (List) cluster.get("clusters");
+      List<NamedList<Object>> subclusters = getSubclusters(cluster);
       assertEquals("numClusters", expectedSubclusterCount, subclusters.size());
       assertEquals("number of subclusters in cluster",
               expectedSubclusterCount, subclusters.size());
     }
   }
+
+	@SuppressWarnings("unchecked")
+	private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
+		return (List<NamedList<Object>>) cluster.get("clusters");
+	}
+
+	@SuppressWarnings("unchecked")
+	private List<String> getLabels(NamedList<Object> cluster) {
+		return (List<String>) cluster.get("labels");
+	}
+
+	private Double getScore(NamedList<Object> cluster) {
+	  return (Double) cluster.get("score");
+	}
+
+	private Boolean isOtherTopics(NamedList<Object> cluster) {
+	  return (Boolean)cluster.get("other-topics");
+	}
+
+	@SuppressWarnings("unchecked")
+	private List<Object> getDocs(NamedList<Object> cluster) {
+		return (List<Object>) cluster.get("docs");
+	}
 }

Modified: lucene/dev/branches/docvalues/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java (original)
+++ lucene/dev/branches/docvalues/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java Wed May 18 16:24:27 2011
@@ -49,6 +49,11 @@ public class MockClusteringAlgorithm ext
   @IntRange(min = 1, max = 5)
   private int labels = 1;
 
+  @Input
+  @Processing
+  @Attribute
+  private int otherTopicsModulo = 0;
+
   @Override
   public void process() throws ProcessingException {
     clusters = Lists.newArrayList();
@@ -59,21 +64,26 @@ public class MockClusteringAlgorithm ext
     int documentIndex = 1;
     for (Document document : documents) {
       StringBuilder label = new StringBuilder("Cluster " + documentIndex);
-      Cluster cluster = createCluster(label.toString(), document);
+      Cluster cluster = createCluster(label.toString(), documentIndex, document);
       clusters.add(cluster);
       for (int i = 1; i <= depth; i++) {
         label.append(".");
         label.append(i);
-        Cluster newCluster = createCluster(label.toString(), document);
-        cluster.addSubclusters(createCluster(label.toString(), document), newCluster);
+        Cluster newCluster = createCluster(label.toString(), documentIndex, document);
+        cluster.addSubclusters(createCluster(label.toString(), documentIndex, document), newCluster);
         cluster = newCluster;
       }
       documentIndex++;
     }
   }
 
-  private Cluster createCluster(String labelBase, Document... documents) {
+  private Cluster createCluster(String labelBase, int documentIndex, Document... documents) {
     Cluster cluster = new Cluster();
+    cluster.setScore(documentIndex * 0.25);
+    if (otherTopicsModulo != 0 && documentIndex % otherTopicsModulo == 0)
+    {
+      cluster.setOtherTopics(true);
+    }
     for (int i = 0; i < labels; i++) {
       cluster.addPhrases(labelBase + "#" + (i + 1));
     }

Modified: lucene/dev/branches/docvalues/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml (original)
+++ lucene/dev/branches/docvalues/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml Wed May 18 16:24:27 2011
@@ -396,6 +396,15 @@
       <str name="name">mock</str>
       <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
     </lst>
+    <lst name="engine">
+      <str name="name">lexical-resource-check</str>
+      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+    </lst>
+    <lst name="engine">
+      <str name="name">lexical-resource-check-custom-resource-dir</str>
+      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+      <str name="carrot.lexicalResourcesDir">clustering/custom</str>
+    </lst>
   </searchComponent>
 
   <searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="doc-clustering">

Modified: lucene/dev/branches/docvalues/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt (original)
+++ lucene/dev/branches/docvalues/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt Wed May 18 16:24:27 2011
@@ -55,4 +55,5 @@ to
 was
 will
 with
+solrownstopword
 

Modified: lucene/dev/branches/docvalues/solr/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/contrib/extraction/CHANGES.txt?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/contrib/extraction/CHANGES.txt (original)
+++ lucene/dev/branches/docvalues/solr/contrib/extraction/CHANGES.txt Wed May 18 16:24:27 2011
@@ -22,7 +22,7 @@ to your Solr Home lib directory.  See ht
 
 Current Version: Tika 0.8 (released 11/07/2010)
 
-$Id:$
+$Id$
 
 ================== Release 4.0-dev ==================
 
@@ -30,7 +30,8 @@ $Id:$
 
 ================== Release 3.2-dev ==================
 
-(No Changes)
+* SOLR-2480: Add ignoreTikaException flag so that users can ignore TikaException but index
+  meta data. (Shinichiro Abe, koji)
 
 ================== Release 3.1-dev ==================
 

Modified: lucene/dev/branches/docvalues/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/branches/docvalues/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Wed May 18 16:24:27 2011
@@ -16,20 +16,27 @@
  */
 package org.apache.solr.handler.extraction;
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.Locale;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.params.UpdateParams;
 import org.apache.solr.common.util.ContentStream;
 import org.apache.solr.common.util.NamedList;
+import org.apache.solr.handler.ContentStreamLoader;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.update.AddUpdateCommand;
 import org.apache.solr.update.processor.UpdateRequestProcessor;
-import org.apache.solr.handler.ContentStreamLoader;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -37,26 +44,24 @@ import org.apache.tika.sax.XHTMLContentH
 import org.apache.tika.sax.xpath.Matcher;
 import org.apache.tika.sax.xpath.MatchingContentHandler;
 import org.apache.tika.sax.xpath.XPathParser;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.mime.MediaType;
-import org.apache.xml.serialize.OutputFormat;
 import org.apache.xml.serialize.BaseMarkupSerializer;
-import org.apache.xml.serialize.XMLSerializer;
+import org.apache.xml.serialize.OutputFormat;
 import org.apache.xml.serialize.TextSerializer;
+import org.apache.xml.serialize.XMLSerializer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.util.Locale;
-
 
 /**
  * The class responsible for loading extracted content into Solr.
  *
  **/
 public class ExtractingDocumentLoader extends ContentStreamLoader {
+
+  private static final Logger log = LoggerFactory.getLogger(ExtractingDocumentLoader.class);
+
   /**
    * Extract Only supported format
    */
@@ -74,6 +79,7 @@ public class ExtractingDocumentLoader ex
   final IndexSchema schema;
   final SolrParams params;
   final UpdateRequestProcessor processor;
+  final boolean ignoreTikaException;
   protected AutoDetectParser autoDetectParser;
 
   private final AddUpdateCommand templateAdd;
@@ -95,6 +101,8 @@ public class ExtractingDocumentLoader ex
     //this is lightweight
     autoDetectParser = new AutoDetectParser(config);
     this.factory = factory;
+    
+    ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
   }
 
 
@@ -180,9 +188,17 @@ public class ExtractingDocumentLoader ex
           parsingHandler = new MatchingContentHandler(handler, matcher);
         } //else leave it as is
 
-        //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
-        ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
-        parser.parse(inputStream, parsingHandler, metadata, context);
+        try{
+          //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
+          ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+          parser.parse(inputStream, parsingHandler, metadata, context);
+        } catch (TikaException e) {
+          if(ignoreTikaException)
+            log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage())
+                .append(". metadata=").append(metadata.toString()).toString());
+          else
+            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+        }
         if (extractOnly == false) {
           addDoc(handler);
         } else {
@@ -202,8 +218,6 @@ public class ExtractingDocumentLoader ex
         }
       } catch (SAXException e) {
         throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
-      } catch (TikaException e) {
-        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
       } finally {
         IOUtils.closeQuietly(inputStream);
       }

Modified: lucene/dev/branches/docvalues/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java (original)
+++ lucene/dev/branches/docvalues/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java Wed May 18 16:24:27 2011
@@ -28,6 +28,11 @@ public interface ExtractingParams {
    */
   public static final String LOWERNAMES = "lowernames";
 
+  /**
+   * if true, ignore TikaException (give up to extract text but index meta data)
+   */
+  public static final String IGNORE_TIKA_EXCEPTION = "ignoreTikaException";
+
 
   /**
    * The param prefix for mapping Tika metadata to Solr fields.

Modified: lucene/dev/branches/docvalues/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java (original)
+++ lucene/dev/branches/docvalues/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java Wed May 18 16:24:27 2011
@@ -47,23 +47,23 @@ import java.util.*;
  */
 public class SolrContentHandler extends DefaultHandler implements ExtractingParams {
   private transient static Logger log = LoggerFactory.getLogger(SolrContentHandler.class);
-  private SolrInputDocument document;
+  protected SolrInputDocument document;
 
-  private Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
+  protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
 
-  private Metadata metadata;
-  private SolrParams params;
-  private StringBuilder catchAllBuilder = new StringBuilder(2048);
-  private IndexSchema schema;
-  private Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
+  protected Metadata metadata;
+  protected SolrParams params;
+  protected StringBuilder catchAllBuilder = new StringBuilder(2048);
+  protected IndexSchema schema;
+  protected Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
   private LinkedList<StringBuilder> bldrStack = new LinkedList<StringBuilder>();
 
-  private boolean captureAttribs;
-  private boolean lowerNames;
-  private String contentFieldName = "content";
+  protected boolean captureAttribs;
+  protected boolean lowerNames;
+  protected String contentFieldName = "content";
 
-  private String unknownFieldPrefix = "";
-  private String defaultField = "";
+  protected String unknownFieldPrefix = "";
+  protected String defaultField = "";
 
   public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
     this(metadata, params, schema, DateUtil.DEFAULT_DATE_FORMATS);
@@ -99,46 +99,82 @@ public class SolrContentHandler extends 
    * The base implementation adds the metadata as fields, allowing for potential remapping.
    *
    * @return The {@link org.apache.solr.common.SolrInputDocument}.
+   *
+   * @see #addMetadata()
+   * @see #addCapturedContent()
+   * @see #addContent()
+   * @see #addLiterals()
    */
   public SolrInputDocument newDocument() {
     float boost = 1.0f;
     //handle the metadata extracted from the document
-    for (String name : metadata.names()) {
-      String[] vals = metadata.getValues(name);
-      addField(name, null, vals);
-    }
+    addMetadata();
 
     //handle the literals from the params
-    Iterator<String> paramNames = params.getParameterNamesIterator();
-    while (paramNames.hasNext()) {
-      String pname = paramNames.next();
-      if (!pname.startsWith(LITERALS_PREFIX)) continue;
-
-      String name = pname.substring(LITERALS_PREFIX.length());
-      addField(name, null, params.getParams(pname));
-    }
+    addLiterals();
 
 
     //add in the content
-    addField(contentFieldName, catchAllBuilder.toString(), null);
+    addContent();
 
     //add in the captured content
+    addCapturedContent();
+
+    if (log.isDebugEnabled()) {
+      log.debug("Doc: {}", document);
+    }
+    return document;
+  }
+
+  /**
+   * Add the per field captured content to the Solr Document.  Default implementation uses the
+   * {@link #fieldBuilders} info
+   */
+  protected void addCapturedContent() {
     for (Map.Entry<String, StringBuilder> entry : fieldBuilders.entrySet()) {
       if (entry.getValue().length() > 0) {
         addField(entry.getKey(), entry.getValue().toString(), null);
       }
     }
-    if (log.isDebugEnabled()) {
-      log.debug("Doc: " + document);
+  }
+
+  /**
+   * Add in the catch all content to the field.  Default impl. uses the {@link #contentFieldName}
+   * and the {@link #catchAllBuilder}
+   */
+  protected void addContent() {
+    addField(contentFieldName, catchAllBuilder.toString(), null);
+  }
+
+  /**
+   * Add in the literals to the document using the {@link #params} and the {@link #LITERALS_PREFIX}.
+   */
+  protected void addLiterals() {
+    Iterator<String> paramNames = params.getParameterNamesIterator();
+    while (paramNames.hasNext()) {
+      String pname = paramNames.next();
+      if (!pname.startsWith(LITERALS_PREFIX)) continue;
+
+      String name = pname.substring(LITERALS_PREFIX.length());
+      addField(name, null, params.getParams(pname));
+    }
+  }
+
+  /**
+   * Add in any metadata using {@link #metadata} as the source.
+   */
+  protected void addMetadata() {
+    for (String name : metadata.names()) {
+      String[] vals = metadata.getValues(name);
+      addField(name, null, vals);
     }
-    return document;
   }
 
   // Naming rules:
   // 1) optionally map names to nicenames (lowercase+underscores)
   // 2) execute "map" commands
   // 3) if resulting field is unknown, map it to a common prefix
-  private void addField(String fname, String fval, String[] vals) {
+  protected void addField(String fname, String fval, String[] vals) {
     if (lowerNames) {
       StringBuilder sb = new StringBuilder();
       for (int i=0; i<fname.length(); i++) {

Modified: lucene/dev/branches/docvalues/solr/contrib/uima/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/contrib/uima/CHANGES.txt?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/contrib/uima/CHANGES.txt (original)
+++ lucene/dev/branches/docvalues/solr/contrib/uima/CHANGES.txt Wed May 18 16:24:27 2011
@@ -33,6 +33,9 @@ New Features
 
 * SOLR-2503: extend mapping function to map feature value to dynamicField. (koji)
 
+* SOLR-2512: add ignoreErrors flag so that users can ignore exceptions in AE.
+  (Tommaso Teofili, koji)
+
 Test Cases:
 ----------------------
 

Modified: lucene/dev/branches/docvalues/solr/contrib/uima/README.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/contrib/uima/README.txt?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/contrib/uima/README.txt (original)
+++ lucene/dev/branches/docvalues/solr/contrib/uima/README.txt Wed May 18 16:24:27 2011
@@ -30,6 +30,13 @@ To start using Solr UIMA Metadata Extrac
           <str name="oc_licenseID">VALID_OPENCALAIS_KEY</str>
         </lst>
         <str name="analysisEngine">/org/apache/uima/desc/OverridingParamsExtServicesAE.xml</str>
+        <!-- Set to true if you want to continue indexing even if text processing fails.
+             Default is false. That is, Solr throws RuntimeException and
+             never indexed documents entirely in your session. -->
+        <bool name="ignoreErrors">true</bool>
+        <!-- This is optional. It is used for logging when text processing fails.
+             Usually, set uniqueKey field name -->
+        <str name="logField">id</str>
         <lst name="analyzeFields">
           <bool name="merge">false</bool>
           <arr name="fields">

Modified: lucene/dev/branches/docvalues/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java (original)
+++ lucene/dev/branches/docvalues/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java Wed May 18 16:24:27 2011
@@ -36,14 +36,20 @@ public class SolrUIMAConfiguration {
 
   private Map<String, Object> runtimeParameters;
 
+  private boolean ignoreErrors;
+  
+  private String logField;
+
   public SolrUIMAConfiguration(String aePath, String[] fieldsToAnalyze, boolean fieldsMerging,
           Map<String, Map<String, MapField>> typesFeaturesFieldsMapping,
-          Map<String, Object> runtimeParameters) {
+          Map<String, Object> runtimeParameters, boolean ignoreErrors, String logField) {
     this.aePath = aePath;
     this.fieldsToAnalyze = fieldsToAnalyze;
     this.fieldsMerging = fieldsMerging;
     this.runtimeParameters = runtimeParameters;
     this.typesFeaturesFieldsMapping = typesFeaturesFieldsMapping;
+    this.ignoreErrors = ignoreErrors;
+    this.logField = logField;
   }
 
   public String[] getFieldsToAnalyze() {
@@ -65,6 +71,14 @@ public class SolrUIMAConfiguration {
   public Map<String, Object> getRuntimeParameters() {
     return runtimeParameters;
   }
+
+  public boolean isIgnoreErrors() {
+    return ignoreErrors;
+  }
+  
+  public String getLogField(){
+    return logField;
+  }
   
   static final class MapField {
     

Modified: lucene/dev/branches/docvalues/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java (original)
+++ lucene/dev/branches/docvalues/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java Wed May 18 16:24:27 2011
@@ -40,7 +40,8 @@ public class SolrUIMAConfigurationReader
 
   public SolrUIMAConfiguration readSolrUIMAConfiguration() {
     return new SolrUIMAConfiguration(readAEPath(), readFieldsToAnalyze(), readFieldsMerging(),
-            readTypesFeaturesFieldsMapping(), readAEOverridingParameters());
+            readTypesFeaturesFieldsMapping(), readAEOverridingParameters(), readIgnoreErrors(),
+            readLogField());
   }
 
   private String readAEPath() {
@@ -105,4 +106,12 @@ public class SolrUIMAConfigurationReader
     return runtimeParameters;
   }
 
+  private boolean readIgnoreErrors() {
+    Object ignoreErrors = args.get("ignoreErrors");
+    return ignoreErrors == null ? false : (Boolean)ignoreErrors;
+  }
+
+  private String readLogField() {
+    return (String)args.get("logField");
+  }
 }

Modified: lucene/dev/branches/docvalues/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java (original)
+++ lucene/dev/branches/docvalues/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java Wed May 18 16:24:27 2011
@@ -20,7 +20,9 @@ package org.apache.solr.uima.processor;
 import java.io.IOException;
 import java.util.Map;
 
+import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.uima.processor.SolrUIMAConfiguration.MapField;
 import org.apache.solr.uima.processor.ae.AEProvider;
@@ -58,12 +60,15 @@ public class UIMAUpdateRequestProcessor 
 
   @Override
   public void processAdd(AddUpdateCommand cmd) throws IOException {
+    String text = null;
     try {
       /* get Solr document */
       SolrInputDocument solrInputDocument = cmd.getSolrInputDocument();
 
       /* get the fields to analyze */
-      for (String text : getTextsToAnalyze(solrInputDocument)) {
+      String[] texts = getTextsToAnalyze(solrInputDocument);
+      for (int i = 0; i < texts.length; i++) {
+        text = texts[i];
         if (text != null && !"".equals(text)) {
           /* process the text value */
           JCas jcas = processText(text);
@@ -79,7 +84,21 @@ public class UIMAUpdateRequestProcessor 
         }
       }
     } catch (UIMAException e) {
-      throw new RuntimeException(e);
+      String logField = solrUIMAConfiguration.getLogField();
+      String optionalFieldInfo = logField == null ? "." :
+        new StringBuilder(". ").append(logField).append("=")
+        .append((String)cmd.getSolrInputDocument().getField(logField).getValue())
+        .append(", ").toString();
+      if (solrUIMAConfiguration.isIgnoreErrors())
+        log.warn(new StringBuilder("skip the text processing due to ")
+          .append(e.getLocalizedMessage()).append(optionalFieldInfo)
+          .append(" text=\"").append(text.substring(0, 100)).append("...\"").toString());
+      else{
+        throw new SolrException(ErrorCode.SERVER_ERROR,
+            new StringBuilder("processing error: ")
+              .append(e.getLocalizedMessage()).append(optionalFieldInfo)
+              .append(" text=\"").append(text.substring(0, 100)).append("...\"").toString(), e);
+      }
     }
     super.processAdd(cmd);
   }

Modified: lucene/dev/branches/docvalues/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java (original)
+++ lucene/dev/branches/docvalues/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java Wed May 18 16:24:27 2011
@@ -93,7 +93,7 @@ public class UIMAUpdateRequestProcessorT
   @Test
   public void testProcessing() throws Exception {
 
-    addDoc(adoc(
+    addDoc("uima", adoc(
             "id",
             "2312312321312",
             "text",
@@ -111,13 +111,13 @@ public class UIMAUpdateRequestProcessorT
   @Test
   public void testTwoUpdates() throws Exception {
 
-    addDoc(adoc("id", "1", "text", "The Apache Software Foundation is happy to announce "
+    addDoc("uima", adoc("id", "1", "text", "The Apache Software Foundation is happy to announce "
             + "BarCampApache Sydney, Australia, the first ASF-backed event in the Southern "
             + "Hemisphere!"));
     assertU(commit());
     assertQ(req("sentence:*"), "//*[@numFound='1']");
 
-    addDoc(adoc("id", "2", "text", "Taking place 11th December 2010 at the University "
+    addDoc("uima", adoc("id", "2", "text", "Taking place 11th December 2010 at the University "
             + "of Sydney's Darlington Centre, the BarCampApache \"unconference\" will be"
             + " attendee-driven, facilitated by members of the Apache community and will "
             + "focus on the Apache..."));
@@ -128,9 +128,41 @@ public class UIMAUpdateRequestProcessorT
     assertQ(req("ORGANIZATION_sm:Apache"), "//*[@numFound='2']");
   }
 
-  private void addDoc(String doc) throws Exception {
+  @Test
+  public void testErrorHandling() throws Exception {
+
+    try{
+      addDoc("uima-not-ignoreErrors", adoc(
+            "id",
+            "2312312321312",
+            "text",
+            "SpellCheckComponent got improvement related to recent Lucene changes. \n  "
+                    + "Add support for specifying Spelling SuggestWord Comparator to Lucene spell "
+                    + "checkers for SpellCheckComponent. Issue SOLR-2053 is already fixed, patch is"
+                    + " attached if you need it, but it is also committed to trunk and 3_x branch."
+                    + " Last Lucene European Conference has been held in Prague."));
+      fail("exception shouldn't be ignored");
+    }
+    catch(RuntimeException expected){}
+    assertU(commit());
+    assertQ(req("*:*"), "//*[@numFound='0']");
+
+    addDoc("uima-ignoreErrors", adoc(
+            "id",
+            "2312312321312",
+            "text",
+            "SpellCheckComponent got improvement related to recent Lucene changes. \n  "
+                    + "Add support for specifying Spelling SuggestWord Comparator to Lucene spell "
+                    + "checkers for SpellCheckComponent. Issue SOLR-2053 is already fixed, patch is"
+                    + " attached if you need it, but it is also committed to trunk and 3_x branch."
+                    + " Last Lucene European Conference has been held in Prague."));
+    assertU(commit());
+    assertQ(req("*:*"), "//*[@numFound='1']");
+  }
+
+  private void addDoc(String chain, String doc) throws Exception {
     Map<String, String[]> params = new HashMap<String, String[]>();
-    params.put(UpdateParams.UPDATE_CHAIN, new String[] { "uima" });
+    params.put(UpdateParams.UPDATE_CHAIN, new String[] { chain });
     MultiMapSolrParams mmparams = new MultiMapSolrParams(params);
     SolrQueryRequestBase req = new SolrQueryRequestBase(h.getCore(), (SolrParams) mmparams) {
     };

Modified: lucene/dev/branches/docvalues/solr/contrib/uima/src/test/resources/solr-uima/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/contrib/uima/src/test/resources/solr-uima/conf/solrconfig.xml?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/contrib/uima/src/test/resources/solr-uima/conf/solrconfig.xml (original)
+++ lucene/dev/branches/docvalues/solr/contrib/uima/src/test/resources/solr-uima/conf/solrconfig.xml Wed May 18 16:24:27 2011
@@ -1003,7 +1003,6 @@
         </lst>
       </lst>
     </processor>
-    <processor class="solr.LogUpdateProcessorFactory" />
     <processor class="solr.RunUpdateProcessorFactory" />
   </updateRequestProcessorChain>
 
@@ -1037,6 +1036,48 @@
     </processor>
   </updateRequestProcessorChain>
 
+  <updateRequestProcessorChain name="uima-not-ignoreErrors">
+    <processor class="org.apache.solr.uima.processor.UIMAUpdateRequestProcessorFactory">
+      <lst name="uimaConfig">
+        <lst name="runtimeParameters">
+          <int name="ngramsize">3</int>
+        </lst>
+        <str name="analysisEngine">/TestExceptionAE.xml</str>
+        <bool name="ignoreErrors">false</bool>
+        <lst name="analyzeFields">
+          <bool name="merge">false</bool>
+          <arr name="fields">
+            <str>text</str>
+          </arr>
+        </lst>
+        <lst name="fieldMappings"/>
+      </lst>
+    </processor>
+    <processor class="solr.RunUpdateProcessorFactory" />
+  </updateRequestProcessorChain>
+
+  <updateRequestProcessorChain name="uima-ignoreErrors">
+    <processor class="org.apache.solr.uima.processor.UIMAUpdateRequestProcessorFactory">
+      <lst name="uimaConfig">
+        <lst name="runtimeParameters">
+          <int name="ngramsize">3</int>
+        </lst>
+        <str name="analysisEngine">/TestExceptionAE.xml</str>
+        <bool name="ignoreErrors">true</bool>
+        <!-- This is optional. It is used for logging when text processing fails. Usually, set uniqueKey field name -->
+        <str name="logField">id</str>
+        <lst name="analyzeFields">
+          <bool name="merge">false</bool>
+          <arr name="fields">
+            <str>text</str>
+          </arr>
+        </lst>
+        <lst name="fieldMappings"/>
+      </lst>
+    </processor>
+    <processor class="solr.RunUpdateProcessorFactory" />
+  </updateRequestProcessorChain>
+
   <!--
     queryResponseWriter plugins... query responses will be written using
     the writer specified by the 'wt' request parameter matching the name

Modified: lucene/dev/branches/docvalues/solr/example/solr/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/example/solr/conf/solrconfig.xml?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/example/solr/conf/solrconfig.xml (original)
+++ lucene/dev/branches/docvalues/solr/example/solr/conf/solrconfig.xml Wed May 18 16:24:27 2011
@@ -1198,17 +1198,20 @@
     <lst name="engine">
       <!-- The name, only one can be named "default" -->
       <str name="name">default</str>
-      <!-- Class name of Carrot2 clustering algorithm. 
-           
+
+      <!-- Class name of Carrot2 clustering algorithm.
+
            Currently available algorithms are:
            
            * org.carrot2.clustering.lingo.LingoClusteringAlgorithm
            * org.carrot2.clustering.stc.STCClusteringAlgorithm
+           * org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
            
            See http://project.carrot2.org/algorithms.html for the
            algorithm's characteristics.
         -->
       <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
+
       <!-- Overriding values for Carrot2 default algorithm attributes.
 
            For a description of all available attributes, see:
@@ -1219,9 +1222,22 @@
            name and attribute value as parameter value.
         -->
       <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
-      
+
+      <!-- Location of Carrot2 lexical resources.
+
+           A directory from which to load Carrot2-specific stop words
+           and stop labels. Absolute or relative to Solr config directory.
+           If a specific resource (e.g. stopwords.en) is present in the
+           specified dir, it will completely override the corresponding
+           default one that ships with Carrot2.
+
+           For an overview of Carrot2 lexical resources, see:
+           http://download.carrot2.org/head/manual/#chapter.lexical-resources
+        -->
+      <str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
+
       <!-- The language to assume for the documents.
-           
+
            For a list of allowed values, see:
            http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
        -->

Modified: lucene/dev/branches/docvalues/solr/src/java/org/apache/solr/core/SolrConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/src/java/org/apache/solr/core/SolrConfig.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/src/java/org/apache/solr/core/SolrConfig.java (original)
+++ lucene/dev/branches/docvalues/solr/src/java/org/apache/solr/core/SolrConfig.java Wed May 18 16:24:27 2011
@@ -57,7 +57,6 @@ import java.util.regex.Pattern;
 import java.util.regex.Matcher;
 import java.io.FileFilter;
 import java.io.IOException;
-import java.io.InputStream;
 
 
 /**
@@ -130,12 +129,12 @@ public class SolrConfig extends Config {
   throws ParserConfigurationException, IOException, SAXException {
     super(loader, name, is, "/config/");
     initLibs();
+    luceneMatchVersion = getLuceneVersion("luceneMatchVersion");
     defaultIndexConfig = new SolrIndexConfig(this, null, null);
     mainIndexConfig = new SolrIndexConfig(this, "mainIndex", defaultIndexConfig);
     reopenReaders = getBool("mainIndex/reopenReaders", true);
     
     booleanQueryMaxClauseCount = getInt("query/maxBooleanClauses", BooleanQuery.getMaxClauseCount());
-    luceneMatchVersion = getLuceneVersion("luceneMatchVersion");
     log.info("Using Lucene MatchVersion: " + luceneMatchVersion);
 
     filtOptEnabled = getBool("query/boolTofilterOptimizer/@enabled", false);

Modified: lucene/dev/branches/docvalues/solr/src/java/org/apache/solr/response/JSONResponseWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/src/java/org/apache/solr/response/JSONResponseWriter.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/src/java/org/apache/solr/response/JSONResponseWriter.java (original)
+++ lucene/dev/branches/docvalues/solr/src/java/org/apache/solr/response/JSONResponseWriter.java Wed May 18 16:24:27 2011
@@ -442,7 +442,7 @@ class JSONWriter extends TextResponseWri
 
       for (int i=0; i<val.length(); i++) {
         char ch = val.charAt(i);
-        if ((ch > '#' && ch != '\\' && ch !=  '\u2028') || ch==' ') { // fast path
+        if ((ch > '#' && ch != '\\' && ch < '\u2028') || ch == ' ') { // fast path
           writer.write(ch);
           continue;
         }
@@ -457,7 +457,10 @@ class JSONWriter extends TextResponseWri
           case '\t': writer.write('\\'); writer.write('t'); break;
           case '\b': writer.write('\\'); writer.write('b'); break;
           case '\f': writer.write('\\'); writer.write('f'); break;
-          case '\u2028': unicodeEscape(writer,ch); break;
+          case '\u2028': // fallthrough
+          case '\u2029':
+            unicodeEscape(writer,ch);
+            break;
           // case '/':
           default: {
             if (ch <= 0x1F) {

Modified: lucene/dev/branches/docvalues/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java (original)
+++ lucene/dev/branches/docvalues/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java Wed May 18 16:24:27 2011
@@ -457,7 +457,7 @@ class JoinQuery extends Query {
         return resultList.get(0);
       }
 
-      int sz = resultList.size();
+      int sz = 0;
 
       for (DocSet set : resultList)
         sz += set.size();

Modified: lucene/dev/branches/docvalues/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java (original)
+++ lucene/dev/branches/docvalues/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java Wed May 18 16:24:27 2011
@@ -811,7 +811,7 @@ public class SolrIndexSearcher extends I
       bitsSet += upto;
       result = new BitDocSet(obs, bitsSet);
     } else {
-      result = new SortedIntDocSet(Arrays.copyOf(docs, upto));
+      result = upto==0 ? DocSet.EMPTY : new SortedIntDocSet(Arrays.copyOf(docs, upto));
     }
 
     if (useCache) {

Modified: lucene/dev/branches/docvalues/solr/src/test-framework/org/apache/solr/JSONTestUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/src/test-framework/org/apache/solr/JSONTestUtil.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/src/test-framework/org/apache/solr/JSONTestUtil.java (original)
+++ lucene/dev/branches/docvalues/solr/src/test-framework/org/apache/solr/JSONTestUtil.java Wed May 18 16:24:27 2011
@@ -25,29 +25,69 @@ import java.util.*;
 
 public class JSONTestUtil {
 
+  /**
+   * Default delta used in numeric equality comparisons for floats and doubles.
+   */
+  public final static double DEFAULT_DELTA = 1e-5;
+
+  /** 
+   * comparison using default delta
+   * @see #DEFAULT_DELTA
+   * @see #match(String,String,double)
+   */
   public static String match(String input, String pathAndExpected) throws Exception {
+    return match(input, pathAndExpected, DEFAULT_DELTA);
+  }
+
+  /** 
+   * comparison using default delta
+   * @see #DEFAULT_DELTA
+   * @see #match(String,String,String,double)
+   */
+  public static String match(String path, String input, String expected) throws Exception {
+    return match(path, input, expected, DEFAULT_DELTA);
+  }
+
+  /**
+   * comparison using default delta
+   * @see #DEFAULT_DELTA
+   * @see #matchObj(String,Object,Object,double)
+   */
+  public static String matchObj(String path, Object input, Object expected) throws Exception {
+    return matchObj(path,input,expected, DEFAULT_DELTA);
+  }
+
+  /**
+   * @param input JSON Structure to parse and test against
+   * @param pathAndExpected JSON path expression + '==' + expected value
+   * @param delta tollerance allowed in comparing float/double values
+   */
+  public static String match(String input, String pathAndExpected, double delta) throws Exception {
     int pos = pathAndExpected.indexOf("==");
     String path = pos>=0 ? pathAndExpected.substring(0,pos) : null;
     String expected = pos>=0 ? pathAndExpected.substring(pos+2) : pathAndExpected;
-    return match(path, input, expected);
+    return match(path, input, expected, delta);
   }
 
-  public static String match(String path, String input, String expected) throws Exception {
+  /**
+   * @param path JSON path expression
+   * @param input JSON Structure to parse and test against
+   * @param expected expected value of path
+   * @param delta tollerance allowed in comparing float/double values
+   */
+  public static String match(String path, String input, String expected, double delta) throws Exception {
     Object inputObj = ObjectBuilder.fromJSON(input);
     Object expectObj = ObjectBuilder.fromJSON(expected);
     return matchObj(path, inputObj, expectObj);
   }
-
-  /**
-  public static Object fromJSON(String json) {
-    try {
-      Object out = ObjectBuilder.fromJSON(json);
-    } finally {
-
-  }
-  **/
   
-  public static String matchObj(String path, Object input, Object expected) throws Exception {
+  /**
+   * @param path JSON path expression
+   * @param input JSON Structure
+   * @param expected expected JSON Object
+   * @param delta tollerance allowed in comparing float/double values
+   */
+  public static String matchObj(String path, Object input, Object expected, double delta) throws Exception {
     CollectionTester tester = new CollectionTester(input);
     boolean reversed = path.startsWith("!");
     String positivePath = reversed ? path.substring(1) : path;
@@ -68,14 +108,19 @@ class CollectionTester {
   public Object val;
   public Object expectedRoot;
   public Object expected;
+  public double delta;
   public List<Object> path;
   public String err;
 
-  public CollectionTester(Object val) {
+  public CollectionTester(Object val, double delta) {
     this.val = val;
     this.valRoot = val;
+    this.delta = delta;
     path = new ArrayList<Object>();
   }
+  public CollectionTester(Object val) {
+    this(val, JSONTestUtil.DEFAULT_DELTA);
+  }
 
   public String getPath() {
     StringBuilder sb = new StringBuilder();
@@ -143,7 +188,7 @@ class CollectionTester {
         double a = ((Number)expected).doubleValue();
         double b = ((Number)val).doubleValue();
         if (Double.compare(a,b) == 0) return true;
-        if (Math.abs(a-b) < 1e-5) return true;
+        if (Math.abs(a-b) < delta) return true;
         return false;
       } else {
         setErr("mismatch: '" + expected + "'!='" + val + "'");

Modified: lucene/dev/branches/docvalues/solr/src/test-framework/org/apache/solr/SolrTestCaseJ4.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/src/test-framework/org/apache/solr/SolrTestCaseJ4.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/src/test-framework/org/apache/solr/SolrTestCaseJ4.java (original)
+++ lucene/dev/branches/docvalues/solr/src/test-framework/org/apache/solr/SolrTestCaseJ4.java Wed May 18 16:24:27 2011
@@ -36,8 +36,12 @@ import org.apache.solr.handler.JsonUpdat
 import org.apache.solr.request.LocalSolrQueryRequest;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.request.SolrRequestHandler;
+import org.apache.solr.response.ResultContext;
+import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.schema.SchemaField;
+import org.apache.solr.search.DocIterator;
+import org.apache.solr.search.DocList;
 import org.apache.solr.search.SolrIndexSearcher;
 import org.apache.solr.servlet.DirectSolrConnection;
 import org.apache.solr.util.TestHarness;
@@ -374,15 +378,29 @@ public abstract class SolrTestCaseJ4 ext
     }
   }
 
-  /** Validates a query matches some JSON test expressions and closes the query.
-   * The text expression is of the form path:JSON.  To facilitate easy embedding
-   * in Java strings, the JSON can have double quotes replaced with single quotes.
-   *
-   * Please use this with care: this makes it easy to match complete structures, but doing so
-   * can result in fragile tests if you are matching more than what you want to test.
-   *
-   **/
+  /** 
+   * Validates a query matches some JSON test expressions using the default double delta tollerance.
+   * @see JSONTestUtil#DEFAULT_DELTA
+   * @see #assertJQ(SolrQueryRequest,double,String...)
+   */
   public static void assertJQ(SolrQueryRequest req, String... tests) throws Exception {
+    assertJQ(req, JSONTestUtil.DEFAULT_DELTA, tests);
+  }
+  /** 
+   * Validates a query matches some JSON test expressions and closes the 
+   * query. The text expression is of the form path:JSON.  To facilitate 
+   * easy embedding in Java strings, the JSON can have double quotes 
+   * replaced with single quotes.
+   * <p>
+   * Please use this with care: this makes it easy to match complete 
+   * structures, but doing so can result in fragile tests if you are 
+   * matching more than what you want to test.
+   * </p>
+   * @param req Solr request to execute
+   * @param delta tollerance allowed in comparing float/double values
+   * @param tests JSON path expression + '==' + expected value
+   */
+  public static void assertJQ(SolrQueryRequest req, double delta, String... tests) throws Exception {
     SolrParams params =  null;
     try {
       params = req.getParams();
@@ -409,7 +427,7 @@ public abstract class SolrTestCaseJ4 ext
 
         try {
           failed = true;
-          String err = JSONTestUtil.match(response, testJSON);
+          String err = JSONTestUtil.match(response, testJSON, delta);
           failed = false;
           if (err != null) {
             log.error("query failed JSON validation. error=" + err +

Modified: lucene/dev/branches/docvalues/solr/src/test/org/apache/solr/TestJoin.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/src/test/org/apache/solr/TestJoin.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/src/test/org/apache/solr/TestJoin.java (original)
+++ lucene/dev/branches/docvalues/solr/src/test/org/apache/solr/TestJoin.java Wed May 18 16:24:27 2011
@@ -101,6 +101,14 @@ public class TestJoin extends SolrTestCa
     int indexIter=50 * RANDOM_MULTIPLIER;
     int queryIter=50 * RANDOM_MULTIPLIER;
 
+    // groups of fields that have any chance of matching... used to
+    // increase test effectiveness by avoiding 0 resultsets much of the time.
+    String[][] compat = new String[][] {
+        {"small_s","small2_s","small2_ss","small3_ss"},
+        {"small_i","small2_i","small2_is","small3_is"}
+    };
+
+
     while (--indexIter >= 0) {
       int indexSize = random.nextInt(20 * RANDOM_MULTIPLIER);
 
@@ -121,8 +129,19 @@ public class TestJoin extends SolrTestCa
       Map<String, Map<Comparable, Set<Comparable>>> pivots = new HashMap<String, Map<Comparable, Set<Comparable>>>();
 
       for (int qiter=0; qiter<queryIter; qiter++) {
-        String fromField = types.get(random.nextInt(types.size())).fname;
-        String toField = types.get(random.nextInt(types.size())).fname;
+        String fromField;
+        String toField;
+        if (random.nextInt(100) < 5) {
+          // pick random fields 5% of the time
+          fromField = types.get(random.nextInt(types.size())).fname;
+          // pick the same field 50% of the time we pick a random field (since other fields won't match anything)
+          toField = (random.nextInt(100) < 50) ? fromField : types.get(random.nextInt(types.size())).fname;
+        } else {
+          // otherwise, pick compatible fields that have a chance of matching indexed tokens
+          String[] group = compat[random.nextInt(compat.length)];
+          fromField = group[random.nextInt(group.length)];
+          toField = group[random.nextInt(group.length)];
+        }
 
         Map<Comparable, Set<Comparable>> pivot = pivots.get(fromField+"/"+toField);
         if (pivot == null) {
@@ -146,7 +165,7 @@ public class TestJoin extends SolrTestCa
         resultSet.put("start", 0);
         resultSet.put("docs", sortedDocs);
 
-        // todo: use filters
+        // todo: use different join queries for better coverage
 
         SolrQueryRequest req = req("wt","json","indent","true", "echoParams","all",
             "q","{!join from="+fromField+" to="+toField
@@ -159,7 +178,7 @@ public class TestJoin extends SolrTestCa
         Object realResponse = ObjectBuilder.fromJSON(strResponse);
         String err = JSONTestUtil.matchObj("/response", realResponse, resultSet);
         if (err != null) {
-          log.error("GROUPING MISMATCH: " + err
+          log.error("JOIN MISMATCH: " + err
            + "\n\trequest="+req
            + "\n\tresult="+strResponse
            + "\n\texpected="+ JSONUtil.toJSON(resultSet)

Modified: lucene/dev/branches/docvalues/solr/src/test/org/apache/solr/request/JSONWriterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/src/test/org/apache/solr/request/JSONWriterTest.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/src/test/org/apache/solr/request/JSONWriterTest.java (original)
+++ lucene/dev/branches/docvalues/solr/src/test/org/apache/solr/request/JSONWriterTest.java Wed May 18 16:24:27 2011
@@ -73,12 +73,12 @@ public class JSONWriterTest extends Solr
 
     StringWriter buf = new StringWriter();
     NamedList nl = new NamedList();
-    nl.add("data1", "hello");
+    nl.add("data1", "he\u2028llo\u2029!");       // make sure that 2028 and 2029 are both escaped (they are illegal in javascript)
     nl.add(null, 42);
     rsp.add("nl", nl);
 
     w.write(buf, req, rsp);
-    assertEquals(buf.toString(), "{\"nl\":[[\"data1\",\"hello\"],[null,42]]}");
+    assertEquals("{\"nl\":[[\"data1\",\"he\\u2028llo\\u2029!\"],[null,42]]}", buf.toString());
     req.close();
   }
   

Modified: lucene/dev/branches/docvalues/solr/src/test/org/apache/solr/search/function/distance/DistanceFunctionTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/solr/src/test/org/apache/solr/search/function/distance/DistanceFunctionTest.java?rev=1124321&r1=1124320&r2=1124321&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/solr/src/test/org/apache/solr/search/function/distance/DistanceFunctionTest.java (original)
+++ lucene/dev/branches/docvalues/solr/src/test/org/apache/solr/search/function/distance/DistanceFunctionTest.java Wed May 18 16:24:27 2011
@@ -76,39 +76,74 @@ public class DistanceFunctionTest extend
     assertU(adoc("id", "100", "store", "1,2"));
     assertU(commit());
    
-    assertJQ(req("defType","func", "q","geodist(1,2,3,4)","fq","id:100","fl","id,score")
-      ,"/response/docs/[0]/score==314.40338"
-    );
+    assertJQ(req("defType","func", 
+                 "q","geodist(1,2,3,4)",
+                 "fq","id:100",
+                 "fl","id,score")
+             , 1e-5
+             , "/response/docs/[0]/score==314.40338"
+             );
 
     // throw in some decimal points
-    assertJQ(req("defType","func", "q","geodist(1.0,2,3,4.0)","fq","id:100","fl","id,score")
-      ,"/response/docs/[0]/score==314.40338"
-    );
+    assertJQ(req("defType","func", 
+                 "q","geodist(1.0,2,3,4.0)",
+                 "fq","id:100",
+                 "fl","id,score")
+             , 1e-5
+             , "/response/docs/[0]/score==314.40338"
+             );
 
     // default to reading pt
-    assertJQ(req("defType","func", "q","geodist(1,2)","pt","3,4", "fq","id:100","fl","id,score")
-      ,"/response/docs/[0]/score==314.40338"
-    );
+    assertJQ(req("defType","func", 
+                 "q","geodist(1,2)",
+                 "pt","3,4", 
+                 "fq","id:100",
+                 "fl","id,score")
+             , 1e-5
+             , "/response/docs/[0]/score==314.40338"
+             );
 
     // default to reading pt first
-    assertJQ(req("defType","func", "q","geodist(1,2)","pt","3,4", "sfield","store", "fq","id:100","fl","id,score")
-      ,"/response/docs/[0]/score==314.40338"
-    );
+    assertJQ(req("defType","func", 
+                 "q","geodist(1,2)",
+                 "pt","3,4", 
+                 "sfield","store", 
+                 "fq","id:100",
+                 "fl","id,score")
+             , 1e-5
+             , "/response/docs/[0]/score==314.40338"
+             );
 
     // if pt missing, use sfield
-    assertJQ(req("defType","func", "q","geodist(3,4)","sfield","store", "fq","id:100","fl","id,score")
-      ,"/response/docs/[0]/score==314.40338"
-    );
-
+    assertJQ(req("defType","func", 
+                 "q","geodist(3,4)",
+                 "sfield","store", 
+                 "fq","id:100",
+                 "fl","id,score")
+             , 1e-5
+             ,"/response/docs/[0]/score==314.40338"
+             );
+    
     // read both pt and sfield
-    assertJQ(req("defType","func", "q","geodist()","pt","3,4","sfield","store", "fq","id:100","fl","id,score")
-      ,"/response/docs/[0]/score==314.40338"
-    );
+    assertJQ(req("defType","func", 
+                 "q","geodist()","pt","3,4",
+                 "sfield","store", 
+                 "fq","id:100",
+                 "fl","id,score")
+             , 1e-5
+             ,"/response/docs/[0]/score==314.40338"
+             );
 
     // param substitution
-    assertJQ(req("defType","func", "q","geodist($a,$b)","a","3,4","b","store", "fq","id:100","fl","id,score")
-      ,"/response/docs/[0]/score==314.40338"
-    );
+    assertJQ(req("defType","func", 
+                 "q","geodist($a,$b)",
+                 "a","3,4",
+                 "b","store", 
+                 "fq","id:100",
+                 "fl","id,score")
+             , 1e-5
+             ,"/response/docs/[0]/score==314.40338"
+             );
 
   }