You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by ds...@apache.org on 2023/01/10 05:59:58 UTC
[solr] 01/03: DocRouter: strengthen abstraction (#1215)

This is an automated email from the ASF dual-hosted git repository.

dsmiley pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git

commit e4b066ee5168df0e72ba5ad4e97ac206e7ee9a17
Author: David Smiley <ds...@salesforce.com>
AuthorDate: Mon Jan 9 17:38:22 2023 -0500

    DocRouter: strengthen abstraction (#1215)
    
    Background: DocRouter is an abstraction with 3 implementations; it isn't pluggable.  There are a number of spots, especially with splits, that were making assumptions about how CompositeIdRouter parsed IDs.
    
    Strengthen the separation of concerns so that the split code can do its job, delegating to the DocRouter (specifically a CompositeIdRouter) on how to parse doc IDs.  It also makes CompositeIdRouter more extendable, but doesn't add subclasses or pluggability.
---
 .../solr/cloud/api/collections/MigrateCmd.java     |  5 +-
 .../org/apache/solr/handler/admin/SplitOp.java     | 63 ++++++++++----------
 .../org/apache/solr/update/SolrIndexSplitter.java  | 22 +++----
 .../processor/DistributedZkUpdateProcessor.java    |  3 +-
 .../solr/handler/admin/SplitHandlerTest.java       |  2 +-
 .../solr/common/cloud/CompositeIdRouter.java       | 68 ++++++++++++++++++++--
 .../org/apache/solr/common/cloud/DocRouter.java    |  8 +--
 7 files changed, 112 insertions(+), 59 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/MigrateCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/MigrateCmd.java
index 70c45027ae4..4ee59d13ca9 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/MigrateCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/MigrateCmd.java
@@ -53,7 +53,6 @@ import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.Utils;
 import org.apache.solr.handler.component.ShardHandler;
-import org.apache.solr.update.SolrIndexSplitter;
 import org.apache.solr.util.TimeOut;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -253,7 +252,7 @@ public class MigrateCmd implements CollApiCmds.CollectionApiCommand {
             SHARD_ID_PROP,
             sourceSlice.getName(),
             "routeKey",
-            SolrIndexSplitter.getRouteKey(splitKey) + "!",
+            sourceRouter.getRouteKeyNoSuffix(splitKey) + "!",
             "range",
             splitRange.toString(),
             "targetCollection",
@@ -283,7 +282,7 @@ public class MigrateCmd implements CollApiCmds.CollectionApiCommand {
       sourceSlice = sourceCollection.getSlice(sourceSlice.getName());
       Map<String, RoutingRule> rules = sourceSlice.getRoutingRules();
       if (rules != null) {
-        RoutingRule rule = rules.get(SolrIndexSplitter.getRouteKey(splitKey) + "!");
+        RoutingRule rule = rules.get(sourceRouter.getRouteKeyNoSuffix(splitKey) + "!");
         if (rule != null && rule.getRouteRanges().contains(splitRange)) {
           added = true;
           break;
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/SplitOp.java b/solr/core/src/java/org/apache/solr/handler/admin/SplitOp.java
index 646b19e2dc2..3b66794c5d5 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/SplitOp.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/SplitOp.java
@@ -263,8 +263,9 @@ class SplitOp implements CoreAdminHandler.CoreAdminOp {
         DocCollection collection = clusterState.getCollection(collectionName);
         String sliceName = parentCore.getCoreDescriptor().getCloudDescriptor().getShardId();
         Slice slice = collection.getSlice(sliceName);
-        DocRouter router =
-            collection.getRouter() != null ? collection.getRouter() : DocRouter.DEFAULT;
+        CompositeIdRouter router =
+            (CompositeIdRouter)
+                (collection.getRouter() != null ? collection.getRouter() : DocRouter.DEFAULT);
         DocRouter.Range currentRange = slice.getRange();
 
         Object routerObj =
@@ -354,7 +355,10 @@ class SplitOp implements CoreAdminHandler.CoreAdminOp {
    * Returns a list of range counts sorted by the range lower bound
    */
   static Collection<RangeCount> getHashHistogram(
-      SolrIndexSearcher searcher, String prefixField, DocRouter router, DocCollection collection)
+      SolrIndexSearcher searcher,
+      String prefixField,
+      CompositeIdRouter router,
+      DocCollection collection)
       throws IOException {
     RTimer timer = new RTimer();
     TreeMap<DocRouter.Range, RangeCount> counts = new TreeMap<>();
@@ -374,14 +378,18 @@ class SplitOp implements CoreAdminHandler.CoreAdminOp {
     while ((term = termsEnum.next()) != null) {
       numPrefixes++;
 
-      String termStr = term.utf8ToString();
-      int firstSep = termStr.indexOf(CompositeIdRouter.SEPARATOR);
       // truncate to first separator since we don't support multiple levels currently
       // NOTE: this does not currently work for tri-level composite ids since the number of bits
       // allocated to the first ID is 16 for a 2 part id and 8 for a 3 part id!
-      if (firstSep != termStr.length() - 1 && firstSep > 0) {
-        numTriLevel++;
-        termStr = termStr.substring(0, firstSep + 1);
+      String termStr;
+      int routeKeyLen = router.getRouteKeyWithSeparator(term.bytes, term.offset, term.length);
+      if (routeKeyLen == 0 || routeKeyLen == term.length) {
+        termStr = term.utf8ToString();
+      } else {
+        int prevLen = term.length;
+        term.length = routeKeyLen;
+        termStr = term.utf8ToString();
+        term.length = prevLen; // restore    (Question: must we do this?)
       }
 
       DocRouter.Range range = router.getSearchRangeSingle(termStr, null, collection);
@@ -418,7 +426,10 @@ class SplitOp implements CoreAdminHandler.CoreAdminOp {
    * (i.e. the terms are full IDs, not just prefixes)
    */
   static Collection<RangeCount> getHashHistogramFromId(
-      SolrIndexSearcher searcher, String idField, DocRouter router, DocCollection collection)
+      SolrIndexSearcher searcher,
+      String idField,
+      CompositeIdRouter router,
+      DocCollection collection)
       throws IOException {
     RTimer timer = new RTimer();
 
@@ -433,9 +444,8 @@ class SplitOp implements CoreAdminHandler.CoreAdminOp {
     int numCollisions = 0;
     long sumBuckets = 0;
 
-    byte sep = (byte) CompositeIdRouter.SEPARATOR.charAt(0);
     TermsEnum termsEnum = terms.iterator();
-    BytesRef currPrefix = new BytesRef(); // prefix of the previous "id" term
+    BytesRef currPrefix = new BytesRef(); // prefix of the previous "id" term WITH SEPARATOR
     int bucketCount = 0; // count of the number of docs in the current bucket
 
     // We're going to iterate over all terms, so do the minimum amount of work per term.
@@ -445,7 +455,9 @@ class SplitOp implements CoreAdminHandler.CoreAdminOp {
       BytesRef term = termsEnum.next();
 
       // compare to current prefix bucket and see if this new term shares the same prefix
-      if (term != null && term.length >= currPrefix.length && currPrefix.length > 0) {
+      if (term != null && currPrefix.length > 0) {
+        // since currPrefix includes the trailing separator, we can assume startsWith is a
+        // sufficient test
         if (StringHelper.startsWith(term, currPrefix)) {
           bucketCount++; // use 1 since we are dealing with unique ids
           continue;
@@ -474,25 +486,16 @@ class SplitOp implements CoreAdminHandler.CoreAdminOp {
       // if the current term is null, we ran out of values
       if (term == null) break;
 
-      // find the new prefix (if any)
-
-      // resize if needed
-      if (currPrefix.length < term.length) {
-        currPrefix.bytes = new byte[term.length + 10];
-      }
-
-      // Copy the bytes up to and including the separator, and set the length if the separator is
-      // found. If there was no separator, then length remains 0 and it's the indicator that we have
-      // no prefix bucket
-      currPrefix.length = 0;
-      for (int i = 0; i < term.length; i++) {
-        byte b = term.bytes[i + term.offset];
-        currPrefix.bytes[i] = b;
-        if (b == sep) {
-          currPrefix.length = i + 1;
-          bucketCount++;
-          break;
+      // find the new prefix (if any), with trailing separator
+      currPrefix.length = router.getRouteKeyWithSeparator(term.bytes, term.offset, term.length);
+      if (currPrefix.length > 0) {
+        // resize if needed
+        if (currPrefix.length > currPrefix.bytes.length) {
+          currPrefix.bytes = new byte[currPrefix.length + 10];
         }
+        System.arraycopy(term.bytes, term.offset, currPrefix.bytes, 0, currPrefix.length);
+
+        bucketCount++;
       }
     }
 
diff --git a/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java b/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java
index a285f79e73f..61ba505ddd5 100644
--- a/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java
+++ b/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java
@@ -133,7 +133,8 @@ public class SolrIndexSplitter {
     if (cmd.splitKey == null) {
       splitKey = null;
     } else {
-      splitKey = getRouteKey(cmd.splitKey);
+      checkRouterSupportsSplitKey(hashRouter, cmd.splitKey);
+      splitKey = ((CompositeIdRouter) hashRouter).getRouteKeyNoSuffix(cmd.splitKey);
     }
     if (cmd.cores == null) {
       this.splitMethod = SplitMethod.REWRITE;
@@ -647,6 +648,7 @@ public class SolrIndexSplitter {
       AtomicInteger currentPartition,
       boolean delete)
       throws IOException {
+    checkRouterSupportsSplitKey(hashRouter, splitKey);
     LeafReader reader = readerContext.reader();
     FixedBitSet[] docSets = new FixedBitSet[numPieces];
     for (int i = 0; i < docSets.length; i++) {
@@ -689,8 +691,7 @@ public class SolrIndexSplitter {
       String idString = idRef.toString();
 
       if (splitKey != null) {
-        // todo have composite routers support these kind of things instead
-        String part1 = getRouteKey(idString);
+        String part1 = ((CompositeIdRouter) hashRouter).getRouteKeyNoSuffix(idString);
         if (part1 == null) continue;
         if (!splitKey.equals(part1)) {
           continue;
@@ -765,18 +766,11 @@ public class SolrIndexSplitter {
     return docSets;
   }
 
-  public static String getRouteKey(String idString) {
-    int idx = idString.indexOf(CompositeIdRouter.SEPARATOR);
-    if (idx <= 0) return null;
-    String part1 = idString.substring(0, idx);
-    int commaIdx = part1.indexOf(CompositeIdRouter.bitsSeparator);
-    if (commaIdx > 0 && commaIdx + 1 < part1.length()) {
-      char ch = part1.charAt(commaIdx + 1);
-      if (ch >= '0' && ch <= '9') {
-        part1 = part1.substring(0, commaIdx);
-      }
+  private static void checkRouterSupportsSplitKey(HashBasedRouter hashRouter, String splitKey) {
+    if (splitKey != null && !(hashRouter instanceof CompositeIdRouter)) {
+      throw new IllegalStateException(
+          "splitKey isn't supported for router " + hashRouter.getClass());
     }
-    return part1;
   }
 
   // change livedocs on the reader to delete those docs we don't want
diff --git a/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java
index fbe89388192..c1eee8572c2 100644
--- a/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java
+++ b/solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java
@@ -67,7 +67,6 @@ import org.apache.solr.update.DeleteUpdateCommand;
 import org.apache.solr.update.MergeIndexesCommand;
 import org.apache.solr.update.RollbackUpdateCommand;
 import org.apache.solr.update.SolrCmdDistributor;
-import org.apache.solr.update.SolrIndexSplitter;
 import org.apache.solr.update.UpdateCommand;
 import org.apache.solr.util.TestInjection;
 import org.apache.zookeeper.KeeperException;
@@ -1067,7 +1066,7 @@ public class DistributedZkUpdateProcessor extends DistributedUpdateProcessor {
           return nodes;
         }
 
-        String routeKey = SolrIndexSplitter.getRouteKey(id);
+        String routeKey = compositeIdRouter.getRouteKeyNoSuffix(id);
         if (routeKey != null) {
           RoutingRule rule = routingRules.get(routeKey + "!");
           if (rule != null) {
diff --git a/solr/core/src/test/org/apache/solr/handler/admin/SplitHandlerTest.java b/solr/core/src/test/org/apache/solr/handler/admin/SplitHandlerTest.java
index f3f1f69322f..011f5fe3885 100644
--- a/solr/core/src/test/org/apache/solr/handler/admin/SplitHandlerTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/admin/SplitHandlerTest.java
@@ -239,7 +239,7 @@ public class SplitHandlerTest extends SolrTestCaseJ4 {
 
     String prefixField = "id_prefix_s";
     String idField = "id";
-    DocRouter router = new CompositeIdRouter();
+    CompositeIdRouter router = new CompositeIdRouter();
 
     for (int i = 0; i < 100; i++) {
       SolrQueryRequest req = req("myquery");
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/CompositeIdRouter.java b/solr/solrj/src/java/org/apache/solr/common/cloud/CompositeIdRouter.java
index dd0b6669195..9094ddd84e8 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/CompositeIdRouter.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/CompositeIdRouter.java
@@ -81,12 +81,57 @@ import org.apache.solr.common.util.Hash;
 public class CompositeIdRouter extends HashBasedRouter {
   public static final String NAME = "compositeId";
 
-  public static final String SEPARATOR = "!";
+  // TODO standardize naming: routeKey (probably) or shardKey or key; pick one.
+
+  /**
+   * This character separates a composite ID into a leading route key and the rest.
+   *
+   * <p>Importantly, it's also used at the end of a provided route key parameter (which appears in
+   * many places) to designate a hash range which translates to a list of slices. If a route key
+   * does not end with this character, then semantically the key points to a single slice that holds
+   * a doc with that ID.
+   */
+  public static final char SEPARATOR = '!';
 
   // separator used to optionally specify number of bits to allocate toward first part.
   public static final int bitsSeparator = '/';
   private int bits = 16;
 
+  /**
+   * Parse out the route key from {@code id} up to and including the {@link #SEPARATOR}, returning
+   * its length. If no route key is detected then 0 is returned.
+   */
+  public int getRouteKeyWithSeparator(byte[] id, int idOffset, int idLength) {
+    final byte SEPARATOR_BYTE = (byte) CompositeIdRouter.SEPARATOR;
+    for (int i = 0; i < idLength; i++) {
+      byte b = id[idOffset + i];
+      if (b == SEPARATOR_BYTE) {
+        return i + 1;
+      }
+    }
+    return 0;
+  }
+
+  /**
+   * Parse out the route key from {@code id}. It will not have the "bits" suffix or separator, if
+   * present. If there is no route key found then null is returned.
+   */
+  public String getRouteKeyNoSuffix(String id) {
+    int idx = id.indexOf(SEPARATOR);
+    if (idx <= 0) {
+      return null;
+    }
+    String part1 = id.substring(0, idx);
+    int commaIdx = part1.indexOf(bitsSeparator);
+    if (commaIdx > 0 && commaIdx + 1 < part1.length()) {
+      char ch = part1.charAt(commaIdx + 1);
+      if (ch >= '0' && ch <= '9') {
+        part1 = part1.substring(0, commaIdx);
+      }
+    }
+    return part1;
+  }
+
   @Override
   public int sliceHash(
       String id, SolrInputDocument doc, SolrParams params, DocCollection collection) {
@@ -132,6 +177,8 @@ public class CompositeIdRouter extends HashBasedRouter {
    * @return Range for given routeKey
    */
   public Range keyHashRange(String routeKey) {
+    routeKey = preprocessRouteKey(routeKey);
+
     if (routeKey.indexOf(SEPARATOR) < 0) {
       int hash = sliceHash(routeKey, null, null, null);
       return new Range(hash, hash);
@@ -147,6 +194,8 @@ public class CompositeIdRouter extends HashBasedRouter {
       return fullRange();
     }
 
+    shardKey = preprocessRouteKey(shardKey);
+
     if (shardKey.indexOf(SEPARATOR) < 0) {
       // shardKey is a simple id, so don't do a range
       int hash = Hash.murmurhash3_x86_32(shardKey, 0, shardKey.length(), 0);
@@ -164,7 +213,8 @@ public class CompositeIdRouter extends HashBasedRouter {
       // TODO: this may need modification in the future when shard splitting could cause an overlap
       return collection.getActiveSlices();
     }
-    String id = shardKey;
+
+    String id = preprocessRouteKey(shardKey);
 
     if (shardKey.indexOf(SEPARATOR) < 0) {
       // shardKey is a simple id, so don't do a range
@@ -185,6 +235,14 @@ public class CompositeIdRouter extends HashBasedRouter {
     return targetSlices;
   }
 
+  /**
+   * Methods accepting a route key (shard key) can have this input preprocessed by a subclass before
+   * further analysis.
+   */
+  protected String preprocessRouteKey(String shardKey) {
+    return shardKey;
+  }
+
   @Override
   public String getName() {
     return NAME;
@@ -266,7 +324,7 @@ public class CompositeIdRouter extends HashBasedRouter {
   }
 
   /** Helper class to calculate parts, masks etc for an id. */
-  static class KeyParser {
+  protected static class KeyParser {
     String key;
     int[] numBits;
     int[] hashes;
@@ -333,7 +391,7 @@ public class CompositeIdRouter extends HashBasedRouter {
       masks = getMasks();
     }
 
-    Range getRange() {
+    public Range getRange() {
       int lowerBound;
       int upperBound;
 
@@ -395,7 +453,7 @@ public class CompositeIdRouter extends HashBasedRouter {
       return masks;
     }
 
-    int getHash() {
+    public int getHash() {
       int result = hashes[0] & masks[0];
 
       for (int i = 1; i < pieces; i++) result = result | (hashes[i] & masks[i]);
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/DocRouter.java b/solr/solrj/src/java/org/apache/solr/common/cloud/DocRouter.java
index af0ebc66f12..490a0a21eda 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/DocRouter.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/DocRouter.java
@@ -39,7 +39,7 @@ import org.noggit.JSONWriter;
  */
 public abstract class DocRouter {
   public static final String DEFAULT_NAME = CompositeIdRouter.NAME;
-  public static final DocRouter DEFAULT = new CompositeIdRouter();
+  public static final DocRouter DEFAULT;
 
   public static DocRouter getDocRouter(String routerName) {
     DocRouter router = routerMap.get(routerName);
@@ -79,11 +79,11 @@ public abstract class DocRouter {
     // to "plain" if it doesn't have any properties.
     routerMap.put(null, plain); // back compat with 4.0
     routerMap.put(PlainIdRouter.NAME, plain);
-    routerMap.put(
-        CompositeIdRouter.NAME,
-        DEFAULT_NAME.equals(CompositeIdRouter.NAME) ? DEFAULT : new CompositeIdRouter());
+    routerMap.put(CompositeIdRouter.NAME, new CompositeIdRouter());
     routerMap.put(ImplicitDocRouter.NAME, new ImplicitDocRouter());
     // NOTE: careful that the map keys (the static .NAME members) are filled in by making them final
+
+    DEFAULT = routerMap.get(DEFAULT_NAME);
   }
 
   // Hash ranges can't currently "wrap" - i.e. max must be greater or equal to min.