You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2011/12/16 17:57:34 UTC

svn commit: r1215220 - in /lucene/dev/branches/solrcloud/solr/core/src: java/org/apache/solr/cloud/ java/org/apache/solr/update/ test/org/apache/solr/cloud/

Author: markrmiller
Date: Fri Dec 16 16:57:34 2011
New Revision: 1215220

URL: http://svn.apache.org/viewvc?rev=1215220&view=rev
Log:
clean up failure responses a bit

Modified:
    lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java
    lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java
    lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyDistributedZkTest.java
    lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java

Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java?rev=1215220&r1=1215219&r2=1215220&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java Fri Dec 16 16:57:34 2011
@@ -63,7 +63,7 @@ public class RecoveryStrat {
   }
   
   public void close() {
-    close  = true;
+    close = true;
   }
   
   // TODO: we want to be pretty noisy if we don't properly recover?
@@ -102,6 +102,7 @@ public class RecoveryStrat {
             EmbeddedSolrServer server = new EmbeddedSolrServer(core);
             server.commit();
             
+            // nocommit: remove this
             RefCounted<SolrIndexSearcher> searcher = core.getSearcher(true,
                 true, null);
             System.out.println("DOCS AFTER REPLAY:"

Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java?rev=1215220&r1=1215219&r2=1215220&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java Fri Dec 16 16:57:34 2011
@@ -292,7 +292,7 @@ public class SolrCmdDistributor {
   void checkResponses(boolean block) {
     
     int expectedResponses = pending == null ? 0 : pending.size();
-    int failedAfterConnect = 0;
+    int nonConnectionErrors = 0;
     int failed = 0;
     while (pending != null && pending.size() > 0) {
       try {
@@ -312,9 +312,13 @@ public class SolrCmdDistributor {
             // the problem is there are other exceptions thrown due to a machine going down mid connection... I've
             // seen Interrupted exceptions.
             
+            // nocommit:
             // we have to match against the msg...:(
-            if (!e.getMessage().contains("java.net.ConnectException: Connection refused")) failedAfterConnect++;
-            
+            if (!e.getMessage().contains(
+                "java.net.ConnectException: Connection refused")
+                || e.getMessage().contains(
+                    "java.net.SocketException: Connection reset")) nonConnectionErrors++;
+
             failed++;
             // use the first exception encountered
             // TODO: perhaps we should do more?
@@ -348,13 +352,16 @@ public class SolrCmdDistributor {
             "interrupted waiting for shard update response", e);
       }
     }
-//    if (failed > 0) {
-//      System.out.println("expected:" + expectedResponses + " failed:" + failed + " failedAfterConnect:" + failedAfterConnect);
-//    }
+    if (failed > 0) {
+      System.out.println("expected:" + expectedResponses + " failed:" + failed + " failedAfterConnect:" + nonConnectionErrors);
+    }
     // TODO: this is a somewhat weak success guarantee - if the request was successful on every replica considered up
     // and that does not return a connect exception, it was successful.
     //should we optionally fail when there is only a single leader for a shard? (no replication)
-    if (failed <= failedAfterConnect && failed != expectedResponses) {
+    
+    // TODO: now we should tell those that failed to try and recover?
+    if (failed > 0 && nonConnectionErrors == 0) {
+      System.out.println("clear exception");
       rsp.setException(null);
     }
   }

Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyDistributedZkTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyDistributedZkTest.java?rev=1215220&r1=1215219&r2=1215220&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyDistributedZkTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyDistributedZkTest.java Fri Dec 16 16:57:34 2011
@@ -22,7 +22,7 @@ import org.junit.BeforeClass;
 
 /**
  * TODO: sometimes the shards are off by a doc or two, even with the
- * retries on index failure...
+ * retries on index failure...perhaps because of leader dying mid update?
  */
 public class ChaosMonkeyDistributedZkTest extends FullDistributedZkTest {
   
@@ -58,7 +58,7 @@ public class ChaosMonkeyDistributedZkTes
     indexThread.safeStop();
     
     // try and wait for any replications and what not to finish...
-    Thread.sleep(4000);
+    Thread.sleep(5000);
     
     commit();
     

Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java?rev=1215220&r1=1215219&r2=1215220&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java Fri Dec 16 16:57:34 2011
@@ -769,6 +769,7 @@ public class FullDistributedZkTest exten
     List<SolrServer> solrClients = shardToClient.get(shard);
     long num = -1;
     long lastNum = -1;
+    String failMessage = null;
     System.out.println("\n\ncheck const");
     for (SolrServer client : solrClients) {
       try {
@@ -777,13 +778,17 @@ public class FullDistributedZkTest exten
         if (e.getMessage().contains("Connection refused")) continue;
         throw e;
       }
-      System.out.println("num:" + num + "\n\n");
-      if (lastNum > -1 && lastNum != num) {
-        fail("shard is not consistent, expected:" + lastNum + " and got:" + num);
+      System.out.println(" num:" + num + "\n");
+      if (lastNum > -1 && lastNum != num && failMessage == null) {
+        failMessage = "shard is not consistent, expected:" + lastNum + " and got:" + num;
       }
       lastNum = num;
     }
     
+    if (failMessage != null) {
+      fail(failMessage);
+    }
+    
     // now check that the right # are on each shard
     long docs = controlClient.query(new SolrQuery("*:*")).getResults().getNumFound();
     Set<String> theShards = shardToClient.keySet();
@@ -919,6 +924,7 @@ public class FullDistributedZkTest exten
     @Override
     public void run() {
       int i = startI;
+      int fails = 0;
       boolean success = false;
       while (true && !stop) {
         success = false;
@@ -929,6 +935,7 @@ public class FullDistributedZkTest exten
                 "to come to the aid of their country.");
             success = true;
           } catch (Exception e) {
+            fails++;
             // on failure, we pause and repeat
             try {
               sleep(10);
@@ -939,7 +946,7 @@ public class FullDistributedZkTest exten
         }
       }
       
-      System.err.println("added docs:" + i);
+      System.err.println("added docs:" + i + " with " + fails + " fails");
     }
     
     public void safeStop() {