You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by vi...@apache.org on 2023/01/18 21:20:32 UTC

[lucene] branch main updated: Remove UTF8TaxonomyWriterCache (#12092)

This is an automated email from the ASF dual-hosted git repository.

vigyasharma pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new dc33ade76d3 Remove UTF8TaxonomyWriterCache (#12092)
dc33ade76d3 is described below

commit dc33ade76d36834ffe7b862f966ca31414712490
Author: Vigya Sharma <vi...@gmail.com>
AuthorDate: Wed Jan 18 13:20:26 2023 -0800

    Remove UTF8TaxonomyWriterCache (#12092)
    
    Removes the never-evicting UTF8TaxonomyWriterCache, changing the default to LruTaxonomyWriterCache
---
 lucene/CHANGES.txt                                 |   3 +
 .../directory/DirectoryTaxonomyWriter.java         |  13 +-
 .../writercache/UTF8TaxonomyWriterCache.java       | 174 ---------------------
 .../directory/TestConcurrentFacetedIndexing.java   |   5 +-
 .../directory/TestDirectoryTaxonomyWriter.java     |  10 +-
 .../writercache/TestUTF8TaxonomyWriterCache.java   | 127 ---------------
 6 files changed, 15 insertions(+), 317 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 75cacb48c81..7319f0a48a9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -8,6 +8,9 @@ http://s.apache.org/luceneversions
 API Changes
 ---------------------
 
+* LUCENE-12092: Remove deprecated UTF8TaxonomyWriterCache. Please use LruTaxonomyWriterCache
+  instead. (Vigya Sharma)
+
 * LUCENE-10010: AutomatonQuery, CompiledAutomaton, RunAutomaton, RegExp
   classes no longer determinize NFAs. Instead it is the responsibility
   of the caller to determinize.  (Robert Muir)
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
index d7c75e9d91b..97ec1c57529 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
@@ -38,7 +38,6 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
 import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
 import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
 import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
-import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
 import org.apache.lucene.index.CorruptIndexException; // javadocs
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
@@ -85,6 +84,8 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
    */
   public static final String INDEX_EPOCH = "index.epoch";
 
+  private static final int DEFAULT_CACHE_SIZE = 4000;
+
   private final Directory dir;
   private final IndexWriter indexWriter;
   private final TaxonomyWriterCache cache;
@@ -128,9 +129,8 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
    *     APPEND_OR_CREATE</code> appends to an existing index if there is one, otherwise it creates
    *     a new index.
    * @param cache A {@link TaxonomyWriterCache} implementation which determines the in-memory
-   *     caching policy. See for example {@link LruTaxonomyWriterCache} and {@link
-   *     UTF8TaxonomyWriterCache}. If null or missing, {@link #defaultTaxonomyWriterCache()} is
-   *     used.
+   *     caching policy. See for example {@link LruTaxonomyWriterCache}. If null or missing, {@link
+   *     #defaultTaxonomyWriterCache()} is used.
    * @throws CorruptIndexException if the taxonomy is corrupted.
    * @throws LockObtainFailedException if the taxonomy is locked by another writer.
    * @throws IOException if another error occurred.
@@ -267,11 +267,10 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
    * Defines the default {@link TaxonomyWriterCache} to use in constructors which do not specify
    * one.
    *
-   * <p>The current default is {@link UTF8TaxonomyWriterCache}, i.e., the entire taxonomy is cached
-   * in memory while building it.
+   * <p>The current default is {@link LruTaxonomyWriterCache}
    */
   public static TaxonomyWriterCache defaultTaxonomyWriterCache() {
-    return new UTF8TaxonomyWriterCache();
+    return new LruTaxonomyWriterCache(DEFAULT_CACHE_SIZE);
   }
 
   /** Create this with {@code OpenMode.CREATE_OR_APPEND}. */
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/UTF8TaxonomyWriterCache.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/UTF8TaxonomyWriterCache.java
deleted file mode 100644
index 9d3c3d61970..00000000000
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/UTF8TaxonomyWriterCache.java
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.facet.taxonomy.writercache;
-
-import org.apache.lucene.facet.taxonomy.FacetLabel;
-import org.apache.lucene.util.Accountable;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.ByteBlockPool;
-import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.BytesRefHash;
-import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.UnicodeUtil;
-
-/** A "cache" that never frees memory, and stores labels in a BytesRefHash (utf-8 encoding). */
-public final class UTF8TaxonomyWriterCache implements TaxonomyWriterCache, Accountable {
-  private final ThreadLocal<BytesRefBuilder> bytes =
-      new ThreadLocal<BytesRefBuilder>() {
-        @Override
-        protected BytesRefBuilder initialValue() {
-          return new BytesRefBuilder();
-        }
-      };
-
-  private final Counter bytesUsed = Counter.newCounter();
-  private final BytesRefHash map =
-      new BytesRefHash(new ByteBlockPool(new DirectTrackingAllocator(bytesUsed)));
-
-  private static final int PAGE_BITS = 16;
-  private static final int PAGE_SIZE = 1 << PAGE_BITS;
-  private static final int PAGE_MASK = PAGE_SIZE - 1;
-
-  private volatile int[][] ordinals;
-
-  // How many labels we are storing:
-  private int count;
-
-  // How many pages in ordinals we've allocated:
-  private int pageCount;
-
-  /** Sole constructor. */
-  public UTF8TaxonomyWriterCache() {
-    ordinals = new int[1][];
-    ordinals[0] = new int[PAGE_SIZE];
-  }
-
-  @Override
-  public int get(FacetLabel label) {
-    BytesRef bytes = toBytes(label);
-    int id;
-    synchronized (this) {
-      id = map.find(bytes);
-    }
-    if (id == -1) {
-      return LabelToOrdinal.INVALID_ORDINAL;
-    }
-    int page = id >>> PAGE_BITS;
-    int offset = id & PAGE_MASK;
-    return ordinals[page][offset];
-  }
-
-  // Called only from assert
-  private boolean assertSameOrdinal(FacetLabel label, int id, int ord) {
-    id = -id - 1;
-    int page = id >>> PAGE_BITS;
-    int offset = id & PAGE_MASK;
-    int oldOrd = ordinals[page][offset];
-    if (oldOrd != ord) {
-      throw new IllegalArgumentException(
-          "label "
-              + label
-              + " was already cached, with old ord="
-              + oldOrd
-              + " versus new ord="
-              + ord);
-    }
-    return true;
-  }
-
-  @Override
-  public boolean put(FacetLabel label, int ord) {
-    BytesRef bytes = toBytes(label);
-    int id;
-    synchronized (this) {
-      id = map.add(bytes);
-      if (id < 0) {
-        assert assertSameOrdinal(label, id, ord);
-        return false;
-      }
-      assert id == count;
-      int page = id >>> PAGE_BITS;
-      int offset = id & PAGE_MASK;
-      if (page == pageCount) {
-        if (page == ordinals.length) {
-          int[][] newOrdinals =
-              new int[ArrayUtil.oversize(page + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)][];
-          System.arraycopy(ordinals, 0, newOrdinals, 0, ordinals.length);
-          ordinals = newOrdinals;
-        }
-        ordinals[page] = new int[PAGE_SIZE];
-        pageCount++;
-      }
-      ordinals[page][offset] = ord;
-      count++;
-
-      // we never prune from the cache
-      return false;
-    }
-  }
-
-  @Override
-  public boolean isFull() {
-    // we are never full
-    return false;
-  }
-
-  @Override
-  public synchronized void clear() {
-    map.clear();
-    map.reinit();
-    ordinals = new int[1][];
-    ordinals[0] = new int[PAGE_SIZE];
-    count = 0;
-    pageCount = 0;
-    assert bytesUsed.get() == 0;
-  }
-
-  /** How many labels are currently stored in the cache. */
-  @Override
-  public int size() {
-    return count;
-  }
-
-  @Override
-  public synchronized long ramBytesUsed() {
-    return bytesUsed.get() + pageCount * (long) PAGE_SIZE * Integer.BYTES;
-  }
-
-  @Override
-  public void close() {}
-
-  private static final byte DELIM_CHAR = (byte) 0x1F;
-
-  private BytesRef toBytes(FacetLabel label) {
-    BytesRefBuilder bytes = this.bytes.get();
-    bytes.clear();
-    for (int i = 0; i < label.length; i++) {
-      String part = label.components[i];
-      if (i > 0) {
-        bytes.append(DELIM_CHAR);
-      }
-      bytes.grow(bytes.length() + UnicodeUtil.maxUTF8Length(part.length()));
-      bytes.setLength(
-          UnicodeUtil.UTF16toUTF8(part, 0, part.length(), bytes.bytes(), bytes.length()));
-    }
-    return bytes.get();
-  }
-}
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestConcurrentFacetedIndexing.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestConcurrentFacetedIndexing.java
index c7909001fdb..e3aab92be83 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestConcurrentFacetedIndexing.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestConcurrentFacetedIndexing.java
@@ -27,7 +27,6 @@ import org.apache.lucene.facet.FacetsConfig;
 import org.apache.lucene.facet.taxonomy.FacetLabel;
 import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
 import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
-import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.store.Directory;
@@ -79,8 +78,8 @@ public class TestConcurrentFacetedIndexing extends FacetTestCase {
   static TaxonomyWriterCache newTaxoWriterCache(int ndocs) {
     final double d = random().nextDouble();
     if (d < 0.7) {
-      // this is the fastest, yet most memory consuming
-      return new UTF8TaxonomyWriterCache();
+      // same as LruTaxonomyWriterCache but with the default cache size
+      return DirectoryTaxonomyWriter.defaultTaxonomyWriterCache();
     } else if (TEST_NIGHTLY && d > 0.98) {
       // this is the slowest, but tests the writer concurrency when no caching is done.
       // only pick it during NIGHTLY tests, and even then, with very low chances.
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java
index 13115864eb9..eb098f08524 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java
@@ -33,7 +33,6 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
 import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
 import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
 import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
-import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
@@ -170,7 +169,7 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
     // Verifies that if rollback is called, DTW is closed.
     Directory dir = newDirectory();
     DirectoryTaxonomyWriter dtw = new DirectoryTaxonomyWriter(dir);
-    assertTrue(dtw.getCache() instanceof UTF8TaxonomyWriterCache);
+    assertTrue(dtw.getCache() instanceof LruTaxonomyWriterCache);
     dtw.addCategory(new FacetLabel("a"));
     dtw.rollback();
     // should not have succeeded to add a category following rollback.
@@ -300,8 +299,8 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
     final double d = random().nextDouble();
     final TaxonomyWriterCache cache;
     if (d < 0.7) {
-      // this is the fastest, yet most memory consuming
-      cache = new UTF8TaxonomyWriterCache();
+      // same as LruTaxonomyWriterCache but with the default cache size
+      cache = DirectoryTaxonomyWriter.defaultTaxonomyWriterCache();
     } else if (TEST_NIGHTLY && d > 0.98) {
       // this is the slowest, but tests the writer concurrency when no caching is done.
       // only pick it during NIGHTLY tests, and even then, with very low chances.
@@ -506,8 +505,7 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
     Directory indexDir = newDirectory(), taxoDir = newDirectory();
     IndexWriter indexWriter =
         new IndexWriter(indexDir, newIndexWriterConfig(new MockAnalyzer(random())));
-    DirectoryTaxonomyWriter taxoWriter =
-        new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, new UTF8TaxonomyWriterCache());
+    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
     FacetsConfig config = new FacetsConfig();
 
     // Add one huge label:
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestUTF8TaxonomyWriterCache.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestUTF8TaxonomyWriterCache.java
deleted file mode 100644
index 321d0a5098b..00000000000
--- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestUTF8TaxonomyWriterCache.java
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.facet.taxonomy.writercache;
-
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
-import org.apache.lucene.facet.FacetTestCase;
-import org.apache.lucene.facet.taxonomy.FacetLabel;
-import org.apache.lucene.tests.util.TestUtil;
-
-public class TestUTF8TaxonomyWriterCache extends FacetTestCase {
-  public void testPageOverflow() throws Exception {
-    UTF8TaxonomyWriterCache cache = new UTF8TaxonomyWriterCache();
-    for (int ord = 0; ord < 65536 * 2; ord++) {
-      cache.put(new FacetLabel("foo:", Integer.toString(ord)), ord);
-    }
-
-    for (int ord = 0; ord < 65536 * 2; ord++) {
-      assertEquals(ord, cache.get(new FacetLabel("foo:", Integer.toString(ord))));
-    }
-  }
-
-  public void testRandom() throws Exception {
-    LabelToOrdinal map = new LabelToOrdinalMap();
-
-    UTF8TaxonomyWriterCache cache = new UTF8TaxonomyWriterCache();
-
-    final int n = atLeast(10 * 1000);
-    final int numUniqueValues = 50 * 1000;
-
-    Random random = random();
-    Set<String> uniqueValuesSet = new HashSet<>();
-    while (uniqueValuesSet.size() < numUniqueValues) {
-      int numParts = TestUtil.nextInt(random(), 1, 5);
-      StringBuilder b = new StringBuilder();
-      for (int i = 0; i < numParts; i++) {
-        String part = null;
-        while (true) {
-          part = TestUtil.randomRealisticUnicodeString(random(), 16);
-          part = part.replace("/", "");
-          if (part.length() > 0) {
-            break;
-          }
-        }
-
-        if (i > 0) {
-          b.append('/');
-        }
-        b.append(part);
-      }
-      uniqueValuesSet.add(b.toString());
-    }
-    String[] uniqueValues = uniqueValuesSet.toArray(new String[0]);
-
-    int ordUpto = 0;
-    for (int i = 0; i < n; i++) {
-
-      int index = random.nextInt(numUniqueValues);
-      FacetLabel label;
-      String s = uniqueValues[index];
-      if (s.length() == 0) {
-        label = new FacetLabel();
-      } else {
-        label = new FacetLabel(s.split("/"));
-      }
-
-      int ord1 = map.getOrdinal(label);
-      int ord2 = cache.get(label);
-
-      assertEquals(ord1, ord2);
-
-      if (ord1 == LabelToOrdinal.INVALID_ORDINAL) {
-        ord1 = ordUpto++;
-        map.addLabel(label, ord1);
-        cache.put(label, ord1);
-      }
-    }
-
-    for (int i = 0; i < numUniqueValues; i++) {
-      FacetLabel label;
-      String s = uniqueValues[i];
-      if (s.length() == 0) {
-        label = new FacetLabel();
-      } else {
-        label = new FacetLabel(s.split("/"));
-      }
-      int ord1 = map.getOrdinal(label);
-      int ord2 = cache.get(label);
-      assertEquals(ord1, ord2);
-    }
-  }
-
-  private static class LabelToOrdinalMap extends LabelToOrdinal {
-    private Map<FacetLabel, Integer> map = new HashMap<>();
-
-    LabelToOrdinalMap() {}
-
-    @Override
-    public void addLabel(FacetLabel label, int ordinal) {
-      map.put(label, ordinal);
-    }
-
-    @Override
-    public int getOrdinal(FacetLabel label) {
-      Integer value = map.get(label);
-      return (value != null) ? value.intValue() : LabelToOrdinal.INVALID_ORDINAL;
-    }
-  }
-}