You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2011/12/16 17:57:34 UTC
svn commit: r1215220 - in /lucene/dev/branches/solrcloud/solr/core/src: java/org/apache/solr/cloud/ java/org/apache/solr/update/ test/org/apache/solr/cloud/
Author: markrmiller
Date: Fri Dec 16 16:57:34 2011
New Revision: 1215220
URL: http://svn.apache.org/viewvc?rev=1215220&view=rev
Log:
clean up failure responses a bit
Modified:
lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java
lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java
lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyDistributedZkTest.java
lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java
Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java?rev=1215220&r1=1215219&r2=1215220&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/cloud/RecoveryStrat.java Fri Dec 16 16:57:34 2011
@@ -63,7 +63,7 @@ public class RecoveryStrat {
}
public void close() {
- close = true;
+ close = true;
}
// TODO: we want to be pretty noisy if we don't properly recover?
@@ -102,6 +102,7 @@ public class RecoveryStrat {
EmbeddedSolrServer server = new EmbeddedSolrServer(core);
server.commit();
+ // nocommit: remove this
RefCounted<SolrIndexSearcher> searcher = core.getSearcher(true,
true, null);
System.out.println("DOCS AFTER REPLAY:"
Modified: lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java?rev=1215220&r1=1215219&r2=1215220&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/java/org/apache/solr/update/SolrCmdDistributor.java Fri Dec 16 16:57:34 2011
@@ -292,7 +292,7 @@ public class SolrCmdDistributor {
void checkResponses(boolean block) {
int expectedResponses = pending == null ? 0 : pending.size();
- int failedAfterConnect = 0;
+ int nonConnectionErrors = 0;
int failed = 0;
while (pending != null && pending.size() > 0) {
try {
@@ -312,9 +312,13 @@ public class SolrCmdDistributor {
// the problem is there are other exceptions thrown due to a machine going down mid connection... I've
// seen Interrupted exceptions.
+ // nocommit:
// we have to match against the msg...:(
- if (!e.getMessage().contains("java.net.ConnectException: Connection refused")) failedAfterConnect++;
-
+ if (!e.getMessage().contains(
+ "java.net.ConnectException: Connection refused")
+ || e.getMessage().contains(
+ "java.net.SocketException: Connection reset")) nonConnectionErrors++;
+
failed++;
// use the first exception encountered
// TODO: perhaps we should do more?
@@ -348,13 +352,16 @@ public class SolrCmdDistributor {
"interrupted waiting for shard update response", e);
}
}
-// if (failed > 0) {
-// System.out.println("expected:" + expectedResponses + " failed:" + failed + " failedAfterConnect:" + failedAfterConnect);
-// }
+ if (failed > 0) {
+ System.out.println("expected:" + expectedResponses + " failed:" + failed + " failedAfterConnect:" + nonConnectionErrors);
+ }
// TODO: this is a somewhat weak success guarantee - if the request was successful on every replica considered up
// and that does not return a connect exception, it was successful.
//should we optionally fail when there is only a single leader for a shard? (no replication)
- if (failed <= failedAfterConnect && failed != expectedResponses) {
+
+ // TODO: now we should tell those that failed to try and recover?
+ if (failed > 0 && nonConnectionErrors == 0) {
+ System.out.println("clear exception");
rsp.setException(null);
}
}
Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyDistributedZkTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyDistributedZkTest.java?rev=1215220&r1=1215219&r2=1215220&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyDistributedZkTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyDistributedZkTest.java Fri Dec 16 16:57:34 2011
@@ -22,7 +22,7 @@ import org.junit.BeforeClass;
/**
* TODO: sometimes the shards are off by a doc or two, even with the
- * retries on index failure...
+ * retries on index failure...perhaps because of leader dying mid update?
*/
public class ChaosMonkeyDistributedZkTest extends FullDistributedZkTest {
@@ -58,7 +58,7 @@ public class ChaosMonkeyDistributedZkTes
indexThread.safeStop();
// try and wait for any replications and what not to finish...
- Thread.sleep(4000);
+ Thread.sleep(5000);
commit();
Modified: lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java?rev=1215220&r1=1215219&r2=1215220&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java (original)
+++ lucene/dev/branches/solrcloud/solr/core/src/test/org/apache/solr/cloud/FullDistributedZkTest.java Fri Dec 16 16:57:34 2011
@@ -769,6 +769,7 @@ public class FullDistributedZkTest exten
List<SolrServer> solrClients = shardToClient.get(shard);
long num = -1;
long lastNum = -1;
+ String failMessage = null;
System.out.println("\n\ncheck const");
for (SolrServer client : solrClients) {
try {
@@ -777,13 +778,17 @@ public class FullDistributedZkTest exten
if (e.getMessage().contains("Connection refused")) continue;
throw e;
}
- System.out.println("num:" + num + "\n\n");
- if (lastNum > -1 && lastNum != num) {
- fail("shard is not consistent, expected:" + lastNum + " and got:" + num);
+ System.out.println(" num:" + num + "\n");
+ if (lastNum > -1 && lastNum != num && failMessage == null) {
+ failMessage = "shard is not consistent, expected:" + lastNum + " and got:" + num;
}
lastNum = num;
}
+ if (failMessage != null) {
+ fail(failMessage);
+ }
+
// now check that the right # are on each shard
long docs = controlClient.query(new SolrQuery("*:*")).getResults().getNumFound();
Set<String> theShards = shardToClient.keySet();
@@ -919,6 +924,7 @@ public class FullDistributedZkTest exten
@Override
public void run() {
int i = startI;
+ int fails = 0;
boolean success = false;
while (true && !stop) {
success = false;
@@ -929,6 +935,7 @@ public class FullDistributedZkTest exten
"to come to the aid of their country.");
success = true;
} catch (Exception e) {
+ fails++;
// on failure, we pause and repeat
try {
sleep(10);
@@ -939,7 +946,7 @@ public class FullDistributedZkTest exten
}
}
- System.err.println("added docs:" + i);
+ System.err.println("added docs:" + i + " with " + fails + " fails");
}
public void safeStop() {