You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@trafficserver.apache.org by "Sudheer Vinukonda (JIRA)" <ji...@apache.org> on 2015/07/14 20:11:04 UTC
[jira] [Commented] (TS-3767) More unbounded retries within read-while-writer breaking loop detection.

    [ https://issues.apache.org/jira/browse/TS-3767?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14626795#comment-14626795 ] 

Sudheer Vinukonda commented on TS-3767:
---------------------------------------

Below is a proposed patch:

{code}
diff --git a/iocore/cache/Cache.cc b/iocore/cache/Cache.cc
index 330aedd..f0610e4 100644
--- a/iocore/cache/Cache.cc
+++ b/iocore/cache/Cache.cc
@@ -80,6 +80,7 @@ int cache_config_enable_checksum = 0;
 int cache_config_alt_rewrite_max_size = 4096;
 int cache_config_read_while_writer = 0;
 int cache_config_mutex_retry_delay = 2;
+int cache_config_writer_retry_delay = 50;
 int cache_config_read_while_writer_max_retries = 10;
 #ifdef HTTP_CACHE
 static int enable_cache_empty_http_doc = 0;
@@ -3191,6 +3192,9 @@ ink_cache_init(ModuleVersion v)
   REC_EstablishStaticConfigInt32(cache_config_read_while_writer_max_retries, "proxy.config.cache.read_while_writer.max_retries");
   Debug("cache_init", "proxy.config.cache.read_while_writer.max_retries = %d", cache_config_read_while_writer_max_retries);
 
+  REC_EstablishStaticConfigInt32(cache_config_writer_retry_delay, "proxy.config.cache.writer_retry_delay");
+  Debug("cache_init", "proxy.config.cache.writer_retry_delay = %dms", cache_config_writer_retry_delay);
+
   REC_EstablishStaticConfigInt32(cache_config_hit_evacuate_percent, "proxy.config.cache.hit_evacuate_percent");
   Debug("cache_init", "proxy.config.cache.hit_evacuate_percent = %d", cache_config_hit_evacuate_percent);
 
diff --git a/iocore/cache/CacheRead.cc b/iocore/cache/CacheRead.cc
index c617ee4..2cbb3fe 100644
--- a/iocore/cache/CacheRead.cc
+++ b/iocore/cache/CacheRead.cc
@@ -336,7 +336,11 @@ CacheVC::openReadFromWriter(int event, Event *e)
       return openReadStartHead(event, e);
     } else if (ret == EVENT_CONT) {
       ink_assert(!write_vc);
-      VC_SCHED_WRITER_RETRY();
+      if (writer_lock_retry < cache_config_read_while_writer_max_retries) {
+        VC_SCHED_WRITER_RETRY();
+      } else {
+        return openReadFromWriterFailure(CACHE_EVENT_OPEN_READ_FAILED, (Event *) - err);
+      }
     } else
       ink_assert(write_vc);
   } else {
@@ -587,8 +591,13 @@ CacheVC::openReadReadDone(int event, Event *e)
         DDebug("cache_read_agg", "%p: key: %X ReadRead writer aborted: %d", this, first_key.slice32(1), (int)vio.ndone);
         goto Lerror;
       }
-      DDebug("cache_read_agg", "%p: key: %X ReadRead retrying: %d", this, first_key.slice32(1), (int)vio.ndone);
-      VC_SCHED_WRITER_RETRY(); // wait for writer
+      if (writer_lock_retry < cache_config_read_while_writer_max_retries) {
+        DDebug("cache_read_agg", "%p: key: %X ReadRead retrying: %d", this, first_key.slice32(1), (int)vio.ndone);
+        VC_SCHED_WRITER_RETRY(); // wait for writer
+      } else {
+        DDebug("cache_read_agg", "%p: key: %X ReadRead retries exhausted, bailing..: %d", this, first_key.slice32(1), (int)vio.ndone);
+        goto Ldone;
+      }
     }
     // fall through for truncated documents
   }
diff --git a/iocore/cache/P_CacheInternal.h b/iocore/cache/P_CacheInternal.h
index eb3a161..000e014 100644
--- a/iocore/cache/P_CacheInternal.h
+++ b/iocore/cache/P_CacheInternal.h
@@ -59,8 +59,6 @@ struct EvacuationBlock;
 #endif
 
 #define AIO_SOFT_FAILURE -100000
-// retry read from writer delay
-#define WRITER_RETRY_DELAY HRTIME_MSECONDS(50)
 
 #ifndef CACHE_LOCK_FAIL_RATE
 #define CACHE_TRY_LOCK(_l, _m, _t) MUTEX_TRY_LOCK(_l, _m, _t)
@@ -96,13 +94,9 @@ struct EvacuationBlock;
   do {                                                            \
     ink_assert(!trigger);                                         \
     writer_lock_retry++;                                          \
-    ink_hrtime _t = WRITER_RETRY_DELAY;                           \
+    ink_hrtime _t = cache_config_writer_retry_delay;              \
     if (writer_lock_retry > 2)                                    \
-      _t = WRITER_RETRY_DELAY * 2;                                \
-    else if (writer_lock_retry > 5)                               \
-      _t = WRITER_RETRY_DELAY * 10;                               \
-    else if (writer_lock_retry > 10)                              \
-      _t = WRITER_RETRY_DELAY * 100;                              \
+      _t = cache_config_writer_retry_delay * 2;                   \
     trigger = mutex->thread_holding->schedule_in_local(this, _t); \
     return EVENT_CONT;                                            \
   } while (0)
@@ -221,6 +215,7 @@ extern int cache_config_hit_evacuate_size_limit;
 extern int cache_config_force_sector_size;
 extern int cache_config_target_fragment_size;
 extern int cache_config_mutex_retry_delay;
+extern int cache_config_writer_retry_delay;
 extern int cache_config_read_while_writer_max_retries;
 
 // CacheVC
diff --git a/mgmt/RecordsConfig.cc b/mgmt/RecordsConfig.cc
index 4d847b0..49c2ce4 100644
--- a/mgmt/RecordsConfig.cc
+++ b/mgmt/RecordsConfig.cc
@@ -966,6 +966,8 @@ static const RecordElement RecordsConfig[] =
   ,
   {RECT_CONFIG, "proxy.config.cache.read_while_writer.max_retries", RECD_INT, "10", RECU_DYNAMIC, RR_NULL, RECC_NULL, NULL, RECA_NULL}
   ,
+  {RECT_CONFIG, "proxy.config.cache.writer_retry_delay", RECD_INT, "50", RECU_DYNAMIC, RR_NULL, RECC_NULL, NULL, RECA_NULL}
+  ,
 
   //##############################################################################
   //#
{code}

> More unbounded retries within read-while-writer breaking loop detection.
> ------------------------------------------------------------------------
>
>                 Key: TS-3767
>                 URL: https://issues.apache.org/jira/browse/TS-3767
>             Project: Traffic Server
>          Issue Type: Bug
>          Components: Cache
>            Reporter: Sudheer Vinukonda
>              Labels: A
>             Fix For: 6.1.0
>
>
> [~zwoop] noticed the loop detection (request to self) fails with default settings, when read-while-writer functionality is enabled. Upon further investigation, this seems to be caused by more cases of unbounded retries within read-while-writer (similar to TS-3622), which prevent reaching the loop detection point (currently, done after the cache lookup state).
> TS-3622 added a bound for read-while-writer in the case, where the writer lock is available, but, the first fragment is not downloaded yet, but, there are more cases which cause similar issues. 
> Discussing with [~zwoop], we think we should add bounds in all such cases with configurable timer (duration) and configurable max number of retries.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)