You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@solr.apache.org by GitBox <gi...@apache.org> on 2021/08/10 22:15:50 UTC

[GitHub] [solr] sonatype-lift[bot] commented on a change in pull request #254: Solr 15581

sonatype-lift[bot] commented on a change in pull request #254:
URL: https://github.com/apache/solr/pull/254#discussion_r686357228



##########
File path: solr/benchmark/src/java/org/apache/solr/bench/FieldDefValueGenerator.java
##########
@@ -0,0 +1,242 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.bench;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Queue;
+import java.util.Random;
+import java.util.SplittableRandom;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.atomic.AtomicInteger;
+import org.apache.commons.lang3.RandomStringUtils;
+import org.apache.commons.lang3.Validate;
+import org.apache.lucene.util.TestUtil;
+import org.apache.solr.common.SolrInputDocument;
+
+/** Generates values for a single field according to its {@link FieldDef} content type. */
+public class FieldDefValueGenerator implements ValueGenerator {
+
+  private final FieldDef fieldDef;
+  private Queue<SolrInputDocument> docs = new ConcurrentLinkedQueue<>();
+
+  private final Map<String, FieldDef> fields = new HashMap<>();
+
+  private static final AtomicInteger ID = new AtomicInteger();
+
+  private ExecutorService executorService;
+  private volatile Object lastValue;
+
+  public FieldDefValueGenerator(FieldDef fieldDef) {
+    this.fieldDef = fieldDef;
+  }
+
+  @Override
+  public Object getNextValue(SplittableRandom random) {
+    Object value;
+    switch (fieldDef.getContent()) {
+      case UNIQUE_INT:
+        value = ID.incrementAndGet();
+        break;
+      case INTEGER:
+        if (fieldDef.getMaxCardinality() > 0) {
+          long start = fieldDef.getCardinalityStart();
+          long seed = nextLong(start, start + fieldDef.getMaxCardinality(), random.split());
+          value = Integer.toString(nextInt(0, Integer.MAX_VALUE, new SplittableRandom(seed)));
+          break;
+        }
+
+        value = Integer.toString(random.nextInt(Integer.MAX_VALUE));
+        break;
+      case ALPHEBETIC:
+        value = getString(fieldDef, val -> getAlphabeticString(fieldDef, random), random);
+        break;
+      case UNICODE:
+        value = getString(fieldDef, val -> getUnicodeString(fieldDef, random), random);
+        break;
+      default:
+        throw new UnsupportedOperationException(
+            "Unsupported content type type=" + fieldDef.getContent());
+    }
+
+    lastValue = value;
+    return value;
+  }
+
+  @Override
+  public Object getLastValue() {
+    return lastValue;
+  }
+
+  private String getString(
+      FieldDef fieldDef, StringSupplier supplier, SplittableRandom threadRandom) {
+    if (fieldDef.getNumTokens() > 1 || fieldDef.getMaxNumTokens() > 1) {
+      StringBuilder sb =
+          new StringBuilder(
+              fieldDef.getNumTokens()
+                  * (Math.max(fieldDef.getLength(), fieldDef.getMaxLength()) + 1));
+      SplittableRandom random = threadRandom.split();
+      for (int i = 0;
+          i
+              < (fieldDef.getMaxNumTokens() > 1
+                  ? random.nextInt(1, fieldDef.getMaxNumTokens())
+                  : fieldDef.getNumTokens());
+          i++) {
+        if (i > 0) {
+          sb.append(' ');
+        }
+        sb.append(supplier.getString(fieldDef));
+      }
+      return sb.toString();
+    }
+    return supplier.getString(fieldDef);
+  }
+
+  private String getUnicodeString(FieldDef fieldDef, SplittableRandom threadRandom) {
+    try {
+      if (fieldDef.getMaxCardinality() > 0) {
+        long start = fieldDef.getCardinalityStart();
+        long seed = nextLong(start, start + fieldDef.getMaxCardinality(), threadRandom.split());
+        if (fieldDef.getLength() > -1) {
+          return TestUtil.randomRealisticUnicodeString(
+              new Random(seed), fieldDef.getLength(), fieldDef.getLength());
+        } else {
+          return TestUtil.randomRealisticUnicodeString(
+              new Random(seed), 1, fieldDef.getMaxLength());
+        }
+      }
+
+      if (fieldDef.getLength() > -1) {
+        return TestUtil.randomRealisticUnicodeString(
+            new Random(threadRandom.nextLong()), fieldDef.getLength(), fieldDef.getLength());
+      } else {
+        return TestUtil.randomRealisticUnicodeString(
+            new Random(threadRandom.nextLong()), 1, fieldDef.getMaxLength());
+      }
+    } catch (Exception e) {
+      throw new RuntimeException("Failed getting UnicodeString with FieldDef=" + fieldDef, e);
+    }
+  }
+
+  private String getAlphabeticString(FieldDef fieldDef, SplittableRandom threadRandom) {
+    try {
+      if (fieldDef.getMaxCardinality() > 0) {
+        long start = fieldDef.getCardinalityStart();
+        long seed = nextLong(start, start + fieldDef.getMaxCardinality(), threadRandom.split());
+        SplittableRandom random = new SplittableRandom(seed);
+        if (fieldDef.getLength() > -1) {
+          return RandomStringUtils.random(
+              nextInt(fieldDef.getLength(), fieldDef.getLength(), random),

Review comment:
       *PREDICTABLE_RANDOM:*  This random generator (java.util.Random) is predictable [(details)](https://find-sec-bugs.github.io/bugs.htm#PREDICTABLE_RANDOM)
   (at-me [in a reply](https://help.sonatype.com/lift) with `help` or `ignore`)

##########
File path: solr/benchmark/src/java/org/apache/solr/bench/UniformLongGenerator.java
##########
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.bench;
+
+import java.util.SplittableRandom;
+import java.util.concurrent.ThreadLocalRandom;
+
+/** Generates longs uniformly at random from an interval. */
+public class UniformLongGenerator implements ValueGenerator {
+  private final long lb, ub, interval;
+  private long lastValue;
+
+  /**
+   * Creates a generator that will return longs uniformly at random from the interval [lb,ub]
+   * inclusive (that is, lb and ub are possible values).
+   *
+   * @param lb the lower bound (inclusive) of generated values
+   * @param ub the upper bound (inclusive) of generated values
+   */
+  public UniformLongGenerator(long lb, long ub) {
+    this.lb = lb;
+    this.ub = ub;
+    interval = this.ub - this.lb + 1;
+  }
+
+  @Override
+  public Long getNextValue(SplittableRandom random) {
+    long ret = Math.abs(ThreadLocalRandom.current().nextLong()) % interval + lb;

Review comment:
       *PREDICTABLE_RANDOM:*  This random generator (java.util.concurrent.ThreadLocalRandom) is predictable [(details)](https://find-sec-bugs.github.io/bugs.htm#PREDICTABLE_RANDOM)
   (at-me [in a reply](https://help.sonatype.com/lift) with `help` or `ignore`)

##########
File path: solr/benchmark/src/java/org/apache/solr/bench/ZipfianGenerator.java
##########
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.bench;
+
+import java.util.SplittableRandom;
+
+/**
+ * A generator of a zipfian distribution. It produces a sequence of items, such that some items are
+ * more popular than others, according to a zipfian distribution. When you construct an instance of
+ * this class, you specify the number of items in the set to draw from, either by specifying an
+ * itemcount (so that the sequence is of items from 0 to itemcount-1) or by specifying a min and a
+ * max (so that the sequence is of items from min to max inclusive). After you construct the
+ * instance, you can change the number of items by calling nextInt(itemcount) or
+ * nextLong(itemcount).
+ *
+ * <p>Note that the popular items will be clustered together, e.g. item 0 is the most popular, item
+ * 1 the second most popular, and so on (or min is the most popular, min+1 the next most popular,
+ * etc.) If you don't want this clustering, and instead want the popular items scattered throughout
+ * the item space, then use ScrambledZipfianGenerator instead.
+ *
+ * <p>Be aware: initializing this generator may take a long time if there are lots of items to
+ * choose from (e.g. over a minute for 100 million objects). This is because certain mathematical
+ * values need to be computed to properly generate a zipfian skew, and one of those values (zeta) is
+ * a sum sequence from 1 to n, where n is the itemcount. Note that if you increase the number of
+ * items in the set, we can compute a new zeta incrementally, so it should be fast unless you have
+ * added millions of items. However, if you decrease the number of items, we recompute zeta from
+ * scratch, so this can take a long time.
+ *
+ * <p>The algorithm used here is from "Quickly Generating Billion-Record Synthetic Databases", Jim
+ * Gray et al, SIGMOD 1994.
+ */
+public class ZipfianGenerator implements ValueGenerator {
+  public static final double ZIPFIAN_CONSTANT = 0.99;
+
+  /** Number of items. */
+  private final long items;
+
+  /** Min item to generate. */
+  private final long base;
+
+  /** The zipfian constant to use. */
+  private final double zipfianconstant;
+
+  /** Computed parameters for generating the distribution. */
+  private double alpha, zetan, eta, theta, zeta2theta;
+
+  /** The number of items used to compute zetan the last time. */
+  private long countforzeta;
+
+  /**
+   * Flag to prevent problems. If you increase the number of items the zipfian generator is allowed
+   * to choose from, this code will incrementally compute a new zeta value for the larger itemcount.
+   * However, if you decrease the number of items, the code computes zeta from scratch; this is
+   * expensive for large itemsets. Usually this is not intentional; e.g. one thread thinks the
+   * number of items is 1001 and calls "nextLong()" with that item count; then another thread that
+   * thinks the number of items is 1000 calls nextLong() with itemcount=1000, triggering the
+   * expensive recomputation. (It is expensive for 100 million items, not really for 1000 items.)
+   * Why did the second thread think there were only 1000 items? Maybe it read the item count before
+   * the first thread incremented it. So this flag allows you to say if you really do want that
+   * recomputation. If true, then the code will recompute zeta if the itemcount goes down. If false,
+   * the code will assume itemcount only goes up, and never recompute.
+   */
+  private boolean allowitemcountdecrease = false;
+
+  private long lastValue;
+
+  /******************************* Constructors **************************************/
+
+  /**
+   * Create a zipfian generator for the specified number of items.
+   *
+   * @param items The number of items in the distribution.
+   */
+  public ZipfianGenerator(long items, SplittableRandom random) {
+    this(0, items - 1, random);
+  }
+
+  /**
+   * Create a zipfian generator for items between min and max.
+   *
+   * @param min The smallest integer to generate in the sequence.
+   * @param max The largest integer to generate in the sequence.
+   */
+  public ZipfianGenerator(long min, long max, SplittableRandom random) {
+    this(min, max, ZIPFIAN_CONSTANT, random);
+  }
+
+  /**
+   * Create a zipfian generator for the specified number of items using the specified zipfian
+   * constant.
+   *
+   * @param items The number of items in the distribution.
+   * @param zipfianconstant The zipfian constant to use.
+   */
+  public ZipfianGenerator(long items, double zipfianconstant, SplittableRandom random) {
+    this(0, items - 1, zipfianconstant, random);
+  }
+
+  /**
+   * Create a zipfian generator for items between min and max (inclusive) for the specified zipfian
+   * constant.
+   *
+   * @param min The smallest integer to generate in the sequence.
+   * @param max The largest integer to generate in the sequence.
+   * @param zipfianconstant The zipfian constant to use.
+   */
+  public ZipfianGenerator(long min, long max, double zipfianconstant, SplittableRandom random) {
+    this(min, max, zipfianconstant, zetastatic(max - min + 1, zipfianconstant), random);
+  }
+
+  /**
+   * Create a zipfian generator for items between min and max (inclusive) for the specified zipfian
+   * constant, using the precomputed value of zeta.
+   *
+   * @param min The smallest integer to generate in the sequence.
+   * @param max The largest integer to generate in the sequence.
+   * @param zipfianconstant The zipfian constant to use.
+   * @param zetan The precomputed zeta constant.
+   */
+  public ZipfianGenerator(
+      long min, long max, double zipfianconstant, double zetan, SplittableRandom random) {
+
+    items = max - min + 1;
+    base = min;
+    this.zipfianconstant = zipfianconstant;
+
+    theta = this.zipfianconstant;
+
+    zeta2theta = zeta(2, theta);
+
+    alpha = 1.0 / (1.0 - theta);
+    this.zetan = zetan;
+    countforzeta = items;
+    eta = (1 - Math.pow(2.0 / items, 1 - theta)) / (1 - zeta2theta / this.zetan);
+
+    getNextValue(random);
+  }
+
+  /**************************************************************************/
+
+  /**
+   * Compute the zeta constant needed for the distribution. Do this from scratch for a distribution
+   * with n items, using the zipfian constant thetaVal. Remember the value of n, so if we change the
+   * itemcount, we can recompute zeta.
+   *
+   * @param n The number of items to compute zeta over.
+   * @param thetaVal The zipfian constant.
+   */
+  double zeta(long n, double thetaVal) {
+    countforzeta = n;

Review comment:
       *THREAD_SAFETY_VIOLATION:*  Unprotected write. Non-private method `ZipfianGenerator.zeta(...)` writes to field `this.countforzeta` outside of synchronization.
    Reporting because another access to the same memory occurs on a background thread, although this access may not.
   (at-me [in a reply](https://help.sonatype.com/lift) with `help` or `ignore`)

##########
File path: solr/benchmark/src/java/org/apache/solr/bench/ZipfianGenerator.java
##########
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.bench;
+
+import java.util.SplittableRandom;
+
+/**
+ * A generator of a zipfian distribution. It produces a sequence of items, such that some items are
+ * more popular than others, according to a zipfian distribution. When you construct an instance of
+ * this class, you specify the number of items in the set to draw from, either by specifying an
+ * itemcount (so that the sequence is of items from 0 to itemcount-1) or by specifying a min and a
+ * max (so that the sequence is of items from min to max inclusive). After you construct the
+ * instance, you can change the number of items by calling nextInt(itemcount) or
+ * nextLong(itemcount).
+ *
+ * <p>Note that the popular items will be clustered together, e.g. item 0 is the most popular, item
+ * 1 the second most popular, and so on (or min is the most popular, min+1 the next most popular,
+ * etc.) If you don't want this clustering, and instead want the popular items scattered throughout
+ * the item space, then use ScrambledZipfianGenerator instead.
+ *
+ * <p>Be aware: initializing this generator may take a long time if there are lots of items to
+ * choose from (e.g. over a minute for 100 million objects). This is because certain mathematical
+ * values need to be computed to properly generate a zipfian skew, and one of those values (zeta) is
+ * a sum sequence from 1 to n, where n is the itemcount. Note that if you increase the number of
+ * items in the set, we can compute a new zeta incrementally, so it should be fast unless you have
+ * added millions of items. However, if you decrease the number of items, we recompute zeta from
+ * scratch, so this can take a long time.
+ *
+ * <p>The algorithm used here is from "Quickly Generating Billion-Record Synthetic Databases", Jim
+ * Gray et al, SIGMOD 1994.
+ */
+public class ZipfianGenerator implements ValueGenerator {
+  public static final double ZIPFIAN_CONSTANT = 0.99;
+
+  /** Number of items. */
+  private final long items;
+
+  /** Min item to generate. */
+  private final long base;
+
+  /** The zipfian constant to use. */
+  private final double zipfianconstant;
+
+  /** Computed parameters for generating the distribution. */
+  private double alpha, zetan, eta, theta, zeta2theta;
+
+  /** The number of items used to compute zetan the last time. */
+  private long countforzeta;
+
+  /**
+   * Flag to prevent problems. If you increase the number of items the zipfian generator is allowed
+   * to choose from, this code will incrementally compute a new zeta value for the larger itemcount.
+   * However, if you decrease the number of items, the code computes zeta from scratch; this is
+   * expensive for large itemsets. Usually this is not intentional; e.g. one thread thinks the
+   * number of items is 1001 and calls "nextLong()" with that item count; then another thread that
+   * thinks the number of items is 1000 calls nextLong() with itemcount=1000, triggering the
+   * expensive recomputation. (It is expensive for 100 million items, not really for 1000 items.)
+   * Why did the second thread think there were only 1000 items? Maybe it read the item count before
+   * the first thread incremented it. So this flag allows you to say if you really do want that
+   * recomputation. If true, then the code will recompute zeta if the itemcount goes down. If false,
+   * the code will assume itemcount only goes up, and never recompute.
+   */
+  private boolean allowitemcountdecrease = false;
+
+  private long lastValue;
+
+  /******************************* Constructors **************************************/
+
+  /**
+   * Create a zipfian generator for the specified number of items.
+   *
+   * @param items The number of items in the distribution.
+   */
+  public ZipfianGenerator(long items, SplittableRandom random) {
+    this(0, items - 1, random);
+  }
+
+  /**
+   * Create a zipfian generator for items between min and max.
+   *
+   * @param min The smallest integer to generate in the sequence.
+   * @param max The largest integer to generate in the sequence.
+   */
+  public ZipfianGenerator(long min, long max, SplittableRandom random) {
+    this(min, max, ZIPFIAN_CONSTANT, random);
+  }
+
+  /**
+   * Create a zipfian generator for the specified number of items using the specified zipfian
+   * constant.
+   *
+   * @param items The number of items in the distribution.
+   * @param zipfianconstant The zipfian constant to use.
+   */
+  public ZipfianGenerator(long items, double zipfianconstant, SplittableRandom random) {
+    this(0, items - 1, zipfianconstant, random);
+  }
+
+  /**
+   * Create a zipfian generator for items between min and max (inclusive) for the specified zipfian
+   * constant.
+   *
+   * @param min The smallest integer to generate in the sequence.
+   * @param max The largest integer to generate in the sequence.
+   * @param zipfianconstant The zipfian constant to use.
+   */
+  public ZipfianGenerator(long min, long max, double zipfianconstant, SplittableRandom random) {
+    this(min, max, zipfianconstant, zetastatic(max - min + 1, zipfianconstant), random);
+  }
+
+  /**
+   * Create a zipfian generator for items between min and max (inclusive) for the specified zipfian
+   * constant, using the precomputed value of zeta.
+   *
+   * @param min The smallest integer to generate in the sequence.
+   * @param max The largest integer to generate in the sequence.
+   * @param zipfianconstant The zipfian constant to use.
+   * @param zetan The precomputed zeta constant.
+   */
+  public ZipfianGenerator(
+      long min, long max, double zipfianconstant, double zetan, SplittableRandom random) {
+
+    items = max - min + 1;
+    base = min;
+    this.zipfianconstant = zipfianconstant;
+
+    theta = this.zipfianconstant;
+
+    zeta2theta = zeta(2, theta);
+
+    alpha = 1.0 / (1.0 - theta);
+    this.zetan = zetan;
+    countforzeta = items;
+    eta = (1 - Math.pow(2.0 / items, 1 - theta)) / (1 - zeta2theta / this.zetan);
+
+    getNextValue(random);
+  }
+
+  /**************************************************************************/
+
+  /**
+   * Compute the zeta constant needed for the distribution. Do this from scratch for a distribution
+   * with n items, using the zipfian constant thetaVal. Remember the value of n, so if we change the
+   * itemcount, we can recompute zeta.
+   *
+   * @param n The number of items to compute zeta over.
+   * @param thetaVal The zipfian constant.
+   */
+  double zeta(long n, double thetaVal) {
+    countforzeta = n;
+    return zetastatic(n, thetaVal);
+  }
+
+  /**
+   * Compute the zeta constant needed for the distribution. Do this from scratch for a distribution
+   * with n items, using the zipfian constant theta. This is a static version of the function which
+   * will not remember n.
+   *
+   * @param n The number of items to compute zeta over.
+   * @param theta The zipfian constant.
+   */
+  static double zetastatic(long n, double theta) {
+    return zetastatic(0, n, theta, 0);
+  }
+
+  /**
+   * Compute the zeta constant needed for the distribution. Do this incrementally for a distribution
+   * that has n items now but used to have st items. Use the zipfian constant thetaVal. Remember the
+   * new value of n so that if we change the itemcount, we'll know to recompute zeta.
+   *
+   * @param st The number of items used to compute the last initialsum
+   * @param n The number of items to compute zeta over.
+   * @param thetaVal The zipfian constant.
+   * @param initialsum The value of zeta we are computing incrementally from.
+   */
+  double zeta(long st, long n, double thetaVal, double initialsum) {
+    countforzeta = n;

Review comment:
       *THREAD_SAFETY_VIOLATION:*  Unprotected write. Non-private method `ZipfianGenerator.zeta(...)` writes to field `this.countforzeta` outside of synchronization.
    Reporting because another access to the same memory occurs on a background thread, although this access may not.
   (at-me [in a reply](https://help.sonatype.com/lift) with `help` or `ignore`)




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@solr.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@solr.apache.org
For additional commands, e-mail: issues-help@solr.apache.org