You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by Sandeep Dharembra <sa...@gmail.com> on 2020/03/04 10:48:06 UTC

Fwd: Issue With Autoscaling

Hi,

*Here are the details of what we are trying to do -*

1) Set up a SolrCloud cluster using Solr version 8.4.1
2) Replica type - Tlog + Pull setup
3) All Tlog replicas to be placed on one node
4) All pull replicas need to be placed on the remaining nodes
5) External Zookeeper to be used
6) On a node going down, we want the node to be deleted from the cluster
7) When a node rejoins the cluster, replicas for all shards should be
appropriately added back to it
8) Number of shards is not yet decided as of now
9) Node to host Tlog replicas will have solr running on port 8984 and the
ones hosting pull have solr running on port 8985

*Steps -*

1) Set up Solr as a system service on 3 nodes - 1 Tlog and 2 Pull (a ZooKeeper
chroot was created)
2) Uploaded a dummy config on zookeeper
3) Start solr process on all three nodes
4) Add policy/preferences to the cluster (copied at the end of the mail)
5) Use the below commands to create the collection


/solr/admin/collections?action=CREATE&name=<name>&numShards=4&maxShardsPerNode=8&tlogReplicas=1&collection.configName=<name>


/solr/admin/collections?action=MODIFYCOLLECTION&collection=<name>&replicationFactor=3

    /solr/admin/collections?action=ADDREPLICA&collection=<name>&shard=shard1&type=pull&pullReplicas=2
    /solr/admin/collections?action=ADDREPLICA&collection=<name>&shard=shard2&type=pull&pullReplicas=2
    /solr/admin/collections?action=ADDREPLICA&collection=<name>&shard=shard3&type=pull&pullReplicas=2
    /solr/admin/collections?action=ADDREPLICA&collection=<name>&shard=shard4&type=pull&pullReplicas=2

6) Stop Solr on one of the servers hosting pull replicas and wait for 2
minutes, as per the trigger's waitFor setting

    Node gets removed from the cluster - no issues here

7) Restart solr process on the same node and wait another 2 minutes

*ISSUE*

Solr starts adding pull replicas only for shard4 on the newly added node, and
it keeps doing so until the node runs out of memory (OOM).

If we change the policy from {"replica": "#ALL", "shard": "#EACH",
"type": "PULL", "nodeset": {"port":"8985"}} to {"replica": ">3", "shard":
"#EACH", "type": "PULL", "nodeset": {"port":"8985"}}, we have no problems,
and pull replicas of all shards get added to the newly added node.

We would like to avoid hardcoding numbers or percentages in the policy. We
also want to restrict Tlog replicas to certain nodes that will not serve read
requests; we plan to achieve this by specifying preferred replicas for serving
requests (similar to the master/slave architecture of Solr 4.x that we
currently use).

Any help would be appreciated

Thanks,
Sandeep



*autoscaling.json for reference*

{
  "responseHeader":{
    "status":0,
    "QTime":20},
  "cluster-preferences":[{
      "minimize":"cores",
      "precision":1}
    ,{
      "maximize":"freedisk",
      "precision":10}
    ,{
      "minimize":"sysLoadAvg"}],
  "cluster-policy":[{
      "replica":"#ALL",
      "shard":"#EACH",
      "type":"PULL",
      "nodeset":{"port":"8985"}}
    ,{
      "replica":"#ALL",
      "type":"TLOG",
      "nodeset":{"port":"8984"}}
    ,{
      "replica":0,
      "type":"PULL",
      "nodeset":{"port":"8984"}}],
  "triggers":{
    ".scheduled_maintenance":{
      "name":".scheduled_maintenance",
      "event":"scheduled",
      "startTime":"NOW",
      "every":"+1DAY",
      "enabled":true,
      "actions":[{
          "name":"inactive_shard_plan",
          "class":"solr.InactiveShardPlanAction"},
        {
          "name":"inactive_markers_plan",
          "class":"solr.InactiveMarkersPlanAction"},
        {
          "name":"execute_plan",
          "class":"solr.ExecutePlanAction"}]},
    "node_lost_trigger":{
      "event":"nodeLost",
      "waitFor":120,
      "preferredOperation":"DELETENODE",
      "actions":[{
          "name":"compute_plan",
          "class":"solr.ComputePlanAction"},
        {
          "name":"execute_plan",
          "class":"solr.ExecutePlanAction"}]},
    "node_added_trigger":{
      "event":"nodeAdded",
      "waitFor":120,
      "preferredOperation":"ADDREPLICA",
      "replicaType":"PULL",
      "actions":[{
          "name":"compute_plan",
          "class":"solr.ComputePlanAction"},
        {
          "name":"execute_plan",
          "class":"solr.ExecutePlanAction"}]},
    ".auto_add_replicas":{
      "name":".auto_add_replicas",
      "event":"nodeLost",
      "waitFor":120,
      "enabled":true,
      "actions":[{
          "name":"auto_add_replicas_plan",
          "class":"solr.AutoAddReplicasPlanAction"},
        {
          "name":"execute_plan",
          "class":"solr.ExecutePlanAction"}]}},
  "listeners":{
    ".scheduled_maintenance.system":{
      "beforeAction":[],
      "afterAction":[],
      "stage":["STARTED",
        "ABORTED",
        "SUCCEEDED",
        "FAILED",
        "BEFORE_ACTION",
        "AFTER_ACTION",
        "IGNORED"],
      "trigger":".scheduled_maintenance",
      "class":"org.apache.solr.cloud.autoscaling.SystemLogListener"},
    "node_added_trigger.system":{
      "beforeAction":[],
      "afterAction":[],
      "stage":["STARTED",
        "ABORTED",
        "SUCCEEDED",
        "FAILED",
        "BEFORE_ACTION",
        "AFTER_ACTION",
        "IGNORED"],
      "trigger":"node_added_trigger",
      "class":"org.apache.solr.cloud.autoscaling.SystemLogListener"},
    "NodeAdded":{
      "beforeAction":[],
      "afterAction":[],
      "trigger":"node_added_trigger",
      "stage":["STARTED",
        "ABORTED",
        "SUCCEEDED",
        "FAILED"],
      "class":"solr.SystemLogListener"},
    "NodeRemoved":{
      "beforeAction":[],
      "afterAction":[],
      "trigger":"node_lost_trigger",
      "stage":["STARTED",
        "ABORTED",
        "SUCCEEDED",
        "FAILED"],
      "class":"solr.SystemLogListener"},
    "node_lost_trigger.system":{
      "beforeAction":[],
      "afterAction":[],
      "stage":["STARTED",
        "ABORTED",
        "SUCCEEDED",
        "FAILED",
        "BEFORE_ACTION",
        "AFTER_ACTION",
        "IGNORED"],
      "trigger":"node_lost_trigger",
      "class":"org.apache.solr.cloud.autoscaling.SystemLogListener"},
    ".auto_add_replicas.system":{
      "beforeAction":[],
      "afterAction":[],
      "stage":["STARTED",
        "ABORTED",
        "SUCCEEDED",
        "FAILED",
        "BEFORE_ACTION",
        "AFTER_ACTION",
        "IGNORED"],
      "trigger":".auto_add_replicas",
      "class":"org.apache.solr.cloud.autoscaling.SystemLogListener"}},
  "properties":{},
  "WARNING":"This response format is experimental.  It is likely to
change in the future."}