You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by vi...@apache.org on 2023/01/18 21:20:32 UTC
[lucene] branch main updated: Remove UTF8TaxonomyWriterCache (#12092)
This is an automated email from the ASF dual-hosted git repository.
vigyasharma pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new dc33ade76d3 Remove UTF8TaxonomyWriterCache (#12092)
dc33ade76d3 is described below
commit dc33ade76d36834ffe7b862f966ca31414712490
Author: Vigya Sharma <vi...@gmail.com>
AuthorDate: Wed Jan 18 13:20:26 2023 -0800
Remove UTF8TaxonomyWriterCache (#12092)
Removes the never-evicting UTF8TaxonomyWriterCache, changing the default to LruTaxonomyWriterCache
---
lucene/CHANGES.txt | 3 +
.../directory/DirectoryTaxonomyWriter.java | 13 +-
.../writercache/UTF8TaxonomyWriterCache.java | 174 ---------------------
.../directory/TestConcurrentFacetedIndexing.java | 5 +-
.../directory/TestDirectoryTaxonomyWriter.java | 10 +-
.../writercache/TestUTF8TaxonomyWriterCache.java | 127 ---------------
6 files changed, 15 insertions(+), 317 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 75cacb48c81..7319f0a48a9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -8,6 +8,9 @@ http://s.apache.org/luceneversions
API Changes
---------------------
+* LUCENE-12092: Remove deprecated UTF8TaxonomyWriterCache. Please use LruTaxonomyWriterCache
+ instead. (Vigya Sharma)
+
* LUCENE-10010: AutomatonQuery, CompiledAutomaton, RunAutomaton, RegExp
classes no longer determinize NFAs. Instead it is the responsibility
of the caller to determinize. (Robert Muir)
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
index d7c75e9d91b..97ec1c57529 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
@@ -38,7 +38,6 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
-import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
import org.apache.lucene.index.CorruptIndexException; // javadocs
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
@@ -85,6 +84,8 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
*/
public static final String INDEX_EPOCH = "index.epoch";
+ private static final int DEFAULT_CACHE_SIZE = 4000;
+
private final Directory dir;
private final IndexWriter indexWriter;
private final TaxonomyWriterCache cache;
@@ -128,9 +129,8 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
* APPEND_OR_CREATE</code> appends to an existing index if there is one, otherwise it creates
* a new index.
* @param cache A {@link TaxonomyWriterCache} implementation which determines the in-memory
- * caching policy. See for example {@link LruTaxonomyWriterCache} and {@link
- * UTF8TaxonomyWriterCache}. If null or missing, {@link #defaultTaxonomyWriterCache()} is
- * used.
+ * caching policy. See for example {@link LruTaxonomyWriterCache}. If null or missing, {@link
+ * #defaultTaxonomyWriterCache()} is used.
* @throws CorruptIndexException if the taxonomy is corrupted.
* @throws LockObtainFailedException if the taxonomy is locked by another writer.
* @throws IOException if another error occurred.
@@ -267,11 +267,10 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
* Defines the default {@link TaxonomyWriterCache} to use in constructors which do not specify
* one.
*
- * <p>The current default is {@link UTF8TaxonomyWriterCache}, i.e., the entire taxonomy is cached
- * in memory while building it.
+ * <p>The current default is {@link LruTaxonomyWriterCache}
*/
public static TaxonomyWriterCache defaultTaxonomyWriterCache() {
- return new UTF8TaxonomyWriterCache();
+ return new LruTaxonomyWriterCache(DEFAULT_CACHE_SIZE);
}
/** Create this with {@code OpenMode.CREATE_OR_APPEND}. */
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/UTF8TaxonomyWriterCache.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/UTF8TaxonomyWriterCache.java
deleted file mode 100644
index 9d3c3d61970..00000000000
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/UTF8TaxonomyWriterCache.java
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.facet.taxonomy.writercache;
-
-import org.apache.lucene.facet.taxonomy.FacetLabel;
-import org.apache.lucene.util.Accountable;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.ByteBlockPool;
-import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.BytesRefHash;
-import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.UnicodeUtil;
-
-/** A "cache" that never frees memory, and stores labels in a BytesRefHash (utf-8 encoding). */
-public final class UTF8TaxonomyWriterCache implements TaxonomyWriterCache, Accountable {
- private final ThreadLocal<BytesRefBuilder> bytes =
- new ThreadLocal<BytesRefBuilder>() {
- @Override
- protected BytesRefBuilder initialValue() {
- return new BytesRefBuilder();
- }
- };
-
- private final Counter bytesUsed = Counter.newCounter();
- private final BytesRefHash map =
- new BytesRefHash(new ByteBlockPool(new DirectTrackingAllocator(bytesUsed)));
-
- private static final int PAGE_BITS = 16;
- private static final int PAGE_SIZE = 1 << PAGE_BITS;
- private static final int PAGE_MASK = PAGE_SIZE - 1;
-
- private volatile int[][] ordinals;
-
- // How many labels we are storing:
- private int count;
-
- // How many pages in ordinals we've allocated:
- private int pageCount;
-
- /** Sole constructor. */
- public UTF8TaxonomyWriterCache() {
- ordinals = new int[1][];
- ordinals[0] = new int[PAGE_SIZE];
- }
-
- @Override
- public int get(FacetLabel label) {
- BytesRef bytes = toBytes(label);
- int id;
- synchronized (this) {
- id = map.find(bytes);
- }
- if (id == -1) {
- return LabelToOrdinal.INVALID_ORDINAL;
- }
- int page = id >>> PAGE_BITS;
- int offset = id & PAGE_MASK;
- return ordinals[page][offset];
- }
-
- // Called only from assert
- private boolean assertSameOrdinal(FacetLabel label, int id, int ord) {
- id = -id - 1;
- int page = id >>> PAGE_BITS;
- int offset = id & PAGE_MASK;
- int oldOrd = ordinals[page][offset];
- if (oldOrd != ord) {
- throw new IllegalArgumentException(
- "label "
- + label
- + " was already cached, with old ord="
- + oldOrd
- + " versus new ord="
- + ord);
- }
- return true;
- }
-
- @Override
- public boolean put(FacetLabel label, int ord) {
- BytesRef bytes = toBytes(label);
- int id;
- synchronized (this) {
- id = map.add(bytes);
- if (id < 0) {
- assert assertSameOrdinal(label, id, ord);
- return false;
- }
- assert id == count;
- int page = id >>> PAGE_BITS;
- int offset = id & PAGE_MASK;
- if (page == pageCount) {
- if (page == ordinals.length) {
- int[][] newOrdinals =
- new int[ArrayUtil.oversize(page + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)][];
- System.arraycopy(ordinals, 0, newOrdinals, 0, ordinals.length);
- ordinals = newOrdinals;
- }
- ordinals[page] = new int[PAGE_SIZE];
- pageCount++;
- }
- ordinals[page][offset] = ord;
- count++;
-
- // we never prune from the cache
- return false;
- }
- }
-
- @Override
- public boolean isFull() {
- // we are never full
- return false;
- }
-
- @Override
- public synchronized void clear() {
- map.clear();
- map.reinit();
- ordinals = new int[1][];
- ordinals[0] = new int[PAGE_SIZE];
- count = 0;
- pageCount = 0;
- assert bytesUsed.get() == 0;
- }
-
- /** How many labels are currently stored in the cache. */
- @Override
- public int size() {
- return count;
- }
-
- @Override
- public synchronized long ramBytesUsed() {
- return bytesUsed.get() + pageCount * (long) PAGE_SIZE * Integer.BYTES;
- }
-
- @Override
- public void close() {}
-
- private static final byte DELIM_CHAR = (byte) 0x1F;
-
- private BytesRef toBytes(FacetLabel label) {
- BytesRefBuilder bytes = this.bytes.get();
- bytes.clear();
- for (int i = 0; i < label.length; i++) {
- String part = label.components[i];
- if (i > 0) {
- bytes.append(DELIM_CHAR);
- }
- bytes.grow(bytes.length() + UnicodeUtil.maxUTF8Length(part.length()));
- bytes.setLength(
- UnicodeUtil.UTF16toUTF8(part, 0, part.length(), bytes.bytes(), bytes.length()));
- }
- return bytes.get();
- }
-}
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestConcurrentFacetedIndexing.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestConcurrentFacetedIndexing.java
index c7909001fdb..e3aab92be83 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestConcurrentFacetedIndexing.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestConcurrentFacetedIndexing.java
@@ -27,7 +27,6 @@ import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
-import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
@@ -79,8 +78,8 @@ public class TestConcurrentFacetedIndexing extends FacetTestCase {
static TaxonomyWriterCache newTaxoWriterCache(int ndocs) {
final double d = random().nextDouble();
if (d < 0.7) {
- // this is the fastest, yet most memory consuming
- return new UTF8TaxonomyWriterCache();
+ // same as LruTaxonomyWriterCache but with the default cache size
+ return DirectoryTaxonomyWriter.defaultTaxonomyWriterCache();
} else if (TEST_NIGHTLY && d > 0.98) {
// this is the slowest, but tests the writer concurrency when no caching is done.
// only pick it during NIGHTLY tests, and even then, with very low chances.
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java
index 13115864eb9..eb098f08524 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java
@@ -33,7 +33,6 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
-import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
@@ -170,7 +169,7 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
// Verifies that if rollback is called, DTW is closed.
Directory dir = newDirectory();
DirectoryTaxonomyWriter dtw = new DirectoryTaxonomyWriter(dir);
- assertTrue(dtw.getCache() instanceof UTF8TaxonomyWriterCache);
+ assertTrue(dtw.getCache() instanceof LruTaxonomyWriterCache);
dtw.addCategory(new FacetLabel("a"));
dtw.rollback();
// should not have succeeded to add a category following rollback.
@@ -300,8 +299,8 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
final double d = random().nextDouble();
final TaxonomyWriterCache cache;
if (d < 0.7) {
- // this is the fastest, yet most memory consuming
- cache = new UTF8TaxonomyWriterCache();
+ // same as LruTaxonomyWriterCache but with the default cache size
+ cache = DirectoryTaxonomyWriter.defaultTaxonomyWriterCache();
} else if (TEST_NIGHTLY && d > 0.98) {
// this is the slowest, but tests the writer concurrency when no caching is done.
// only pick it during NIGHTLY tests, and even then, with very low chances.
@@ -506,8 +505,7 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
Directory indexDir = newDirectory(), taxoDir = newDirectory();
IndexWriter indexWriter =
new IndexWriter(indexDir, newIndexWriterConfig(new MockAnalyzer(random())));
- DirectoryTaxonomyWriter taxoWriter =
- new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, new UTF8TaxonomyWriterCache());
+ DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
FacetsConfig config = new FacetsConfig();
// Add one huge label:
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestUTF8TaxonomyWriterCache.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestUTF8TaxonomyWriterCache.java
deleted file mode 100644
index 321d0a5098b..00000000000
--- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestUTF8TaxonomyWriterCache.java
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.facet.taxonomy.writercache;
-
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
-import org.apache.lucene.facet.FacetTestCase;
-import org.apache.lucene.facet.taxonomy.FacetLabel;
-import org.apache.lucene.tests.util.TestUtil;
-
-public class TestUTF8TaxonomyWriterCache extends FacetTestCase {
- public void testPageOverflow() throws Exception {
- UTF8TaxonomyWriterCache cache = new UTF8TaxonomyWriterCache();
- for (int ord = 0; ord < 65536 * 2; ord++) {
- cache.put(new FacetLabel("foo:", Integer.toString(ord)), ord);
- }
-
- for (int ord = 0; ord < 65536 * 2; ord++) {
- assertEquals(ord, cache.get(new FacetLabel("foo:", Integer.toString(ord))));
- }
- }
-
- public void testRandom() throws Exception {
- LabelToOrdinal map = new LabelToOrdinalMap();
-
- UTF8TaxonomyWriterCache cache = new UTF8TaxonomyWriterCache();
-
- final int n = atLeast(10 * 1000);
- final int numUniqueValues = 50 * 1000;
-
- Random random = random();
- Set<String> uniqueValuesSet = new HashSet<>();
- while (uniqueValuesSet.size() < numUniqueValues) {
- int numParts = TestUtil.nextInt(random(), 1, 5);
- StringBuilder b = new StringBuilder();
- for (int i = 0; i < numParts; i++) {
- String part = null;
- while (true) {
- part = TestUtil.randomRealisticUnicodeString(random(), 16);
- part = part.replace("/", "");
- if (part.length() > 0) {
- break;
- }
- }
-
- if (i > 0) {
- b.append('/');
- }
- b.append(part);
- }
- uniqueValuesSet.add(b.toString());
- }
- String[] uniqueValues = uniqueValuesSet.toArray(new String[0]);
-
- int ordUpto = 0;
- for (int i = 0; i < n; i++) {
-
- int index = random.nextInt(numUniqueValues);
- FacetLabel label;
- String s = uniqueValues[index];
- if (s.length() == 0) {
- label = new FacetLabel();
- } else {
- label = new FacetLabel(s.split("/"));
- }
-
- int ord1 = map.getOrdinal(label);
- int ord2 = cache.get(label);
-
- assertEquals(ord1, ord2);
-
- if (ord1 == LabelToOrdinal.INVALID_ORDINAL) {
- ord1 = ordUpto++;
- map.addLabel(label, ord1);
- cache.put(label, ord1);
- }
- }
-
- for (int i = 0; i < numUniqueValues; i++) {
- FacetLabel label;
- String s = uniqueValues[i];
- if (s.length() == 0) {
- label = new FacetLabel();
- } else {
- label = new FacetLabel(s.split("/"));
- }
- int ord1 = map.getOrdinal(label);
- int ord2 = cache.get(label);
- assertEquals(ord1, ord2);
- }
- }
-
- private static class LabelToOrdinalMap extends LabelToOrdinal {
- private Map<FacetLabel, Integer> map = new HashMap<>();
-
- LabelToOrdinalMap() {}
-
- @Override
- public void addLabel(FacetLabel label, int ordinal) {
- map.put(label, ordinal);
- }
-
- @Override
- public int getOrdinal(FacetLabel label) {
- Integer value = map.get(label);
- return (value != null) ? value.intValue() : LabelToOrdinal.INVALID_ORDINAL;
- }
- }
-}