You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by no...@apache.org on 2020/02/21 02:43:18 UTC
[lucene-solr] branch branch_8x updated: SOLR-14270 export command
to have an option to write to a zip file (#1266)
This is an automated email from the ASF dual-hosted git repository.
noble pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new fb9024f SOLR-14270 export command to have an option to write to a zip file (#1266)
fb9024f is described below
commit fb9024f0e5651d5dd38915ee1a6c66bef8927e3b
Author: Noble Paul <no...@users.noreply.github.com>
AuthorDate: Fri Feb 21 13:41:50 2020 +1100
SOLR-14270 export command to have an option to write to a zip file (#1266)
---
solr/CHANGES.txt | 2 +
.../src/java/org/apache/solr/util/ExportTool.java | 46 +++++++++++++++-------
.../test/org/apache/solr/util/TestExportTool.java | 2 +-
.../solr-ref-guide/src/command-line-utilities.adoc | 25 ++++++++++++
4 files changed, 59 insertions(+), 16 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 6b7f585..9ef0ff9 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -71,6 +71,8 @@ Improvements
* SOLR-14194: Highlighting now works when the uniqueKey field is not stored but has docValues. And the original
highlighter can now highlight text fields from docValues. (Andrzej Wislowski, David Smiley)
+* SOLR-14270: export command to have an option to write to a zip file (noble)
+
Optimizations
---------------------
diff --git a/solr/core/src/java/org/apache/solr/util/ExportTool.java b/solr/core/src/java/org/apache/solr/util/ExportTool.java
index 4b30c8e..766d69e 100644
--- a/solr/core/src/java/org/apache/solr/util/ExportTool.java
+++ b/solr/core/src/java/org/apache/solr/util/ExportTool.java
@@ -41,11 +41,13 @@ import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
+import java.util.zip.GZIPOutputStream;
import com.google.common.collect.ImmutableSet;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
+import org.apache.lucene.util.SuppressForbidden;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrServerException;
@@ -185,13 +187,24 @@ public class ExportTool extends SolrCLI.ToolBase {
info.exportDocs();
}
- interface DocsSink {
- default void start() throws IOException {
- }
+ static abstract class DocsSink {
+ Info info;
+ OutputStream fos;
+
+ abstract void start() throws IOException ;
- void accept(SolrDocument document) throws IOException, InterruptedException;
+ @SuppressForbidden(reason = "Command line tool prints out to console")
+ void accept(SolrDocument document) throws IOException {
+ long count = info.docsWritten.incrementAndGet();
+
+ if (count % 100000 == 0) {
+ System.out.println("\nDOCS: " + count);
+ }
- default void end() throws IOException {
+
+ }
+
+ void end() throws IOException {
}
}
@@ -228,13 +241,10 @@ public class ExportTool extends SolrCLI.ToolBase {
.create("fields")
};
- static class JsonSink implements DocsSink {
- private final Info info;
+ static class JsonSink extends DocsSink {
private CharArr charArr = new CharArr(1024 * 2);
JSONWriter jsonWriter = new JSONWriter(charArr, -1);
private Writer writer;
- private OutputStream fos;
- public AtomicLong docs = new AtomicLong();
public JsonSink(Info info) {
this.info = info;
@@ -243,6 +253,7 @@ public class ExportTool extends SolrCLI.ToolBase {
@Override
public void start() throws IOException {
fos = new FileOutputStream(info.out);
+ if(info.out.endsWith(".json.gz") || info.out.endsWith(".json.")) fos = new GZIPOutputStream(fos);
if (info.bufferSize > 0) {
fos = new BufferedOutputStream(fos, info.bufferSize);
}
@@ -259,7 +270,6 @@ public class ExportTool extends SolrCLI.ToolBase {
@Override
public synchronized void accept(SolrDocument doc) throws IOException {
- docs.incrementAndGet();
charArr.reset();
Map m = new LinkedHashMap(doc.size());
doc.forEach((s, field) -> {
@@ -274,13 +284,12 @@ public class ExportTool extends SolrCLI.ToolBase {
jsonWriter.write(m);
writer.write(charArr.getArray(), charArr.getStart(), charArr.getEnd());
writer.append('\n');
+ super.accept(doc);
}
}
- private static class JavabinSink implements DocsSink {
- private final Info info;
+ static class JavabinSink extends DocsSink {
JavaBinCodec codec;
- OutputStream fos;
public JavabinSink(Info info) {
this.info = info;
@@ -289,6 +298,7 @@ public class ExportTool extends SolrCLI.ToolBase {
@Override
public void start() throws IOException {
fos = new FileOutputStream(info.out);
+ if(info.out.endsWith(".json.gz") || info.out.endsWith(".json.")) fos = new GZIPOutputStream(fos);
if (info.bufferSize > 0) {
fos = new BufferedOutputStream(fos, info.bufferSize);
}
@@ -330,6 +340,7 @@ public class ExportTool extends SolrCLI.ToolBase {
codec.writeTag(SOLRINPUTDOC, sz);
codec.writeFloat(1f); // document boost
doc.forEach(bic);
+ super.accept(doc);
}
}
@@ -339,13 +350,17 @@ public class ExportTool extends SolrCLI.ToolBase {
SolrDocument EOFDOC = new SolrDocument();
volatile boolean failed = false;
Map<String, CoreHandler> corehandlers = new HashMap();
+ private long startTime ;
+ @SuppressForbidden(reason = "Need to print out time")
public MultiThreadedRunner(String url) {
super(url);
+ startTime= System.currentTimeMillis();
}
@Override
+ @SuppressForbidden(reason = "Need to print out time")
void exportDocs() throws Exception {
sink = getSink();
fetchUniqueKey();
@@ -362,7 +377,7 @@ public class ExportTool extends SolrCLI.ToolBase {
addConsumer(consumerlatch);
addProducers(m);
if (output != null) {
- output.println("NO of shards : " + corehandlers.size());
+ output.println("NO: of shards : " + corehandlers.size());
}
CountDownLatch producerLatch = new CountDownLatch(corehandlers.size());
corehandlers.forEach((s, coreHandler) -> producerThreadpool.submit(() -> {
@@ -390,6 +405,8 @@ public class ExportTool extends SolrCLI.ToolBase {
//ignore
}
}
+ System.out.println("\nTotal Docs exported: "+ (docsWritten.get() -1)+
+ ". Time taken: "+( (System.currentTimeMillis() - startTime)/1000) + "secs");
}
}
@@ -418,7 +435,6 @@ public class ExportTool extends SolrCLI.ToolBase {
try {
if (docsWritten.get() > limit) continue;
sink.accept(doc);
- docsWritten.incrementAndGet();
} catch (Exception e) {
if (output != null) output.println("Failed to write to file " + e.getMessage());
failed = true;
diff --git a/solr/core/src/test/org/apache/solr/util/TestExportTool.java b/solr/core/src/test/org/apache/solr/util/TestExportTool.java
index 9e637f9..0351166 100644
--- a/solr/core/src/test/org/apache/solr/util/TestExportTool.java
+++ b/solr/core/src/test/org/apache/solr/util/TestExportTool.java
@@ -186,7 +186,7 @@ public class TestExportTool extends SolrCloudTestCase {
info.fields = "id,desc_s";
info.setLimit("-1");
info.exportDocs();
- long actual = ((ExportTool.JsonSink) info.sink).docs.get();
+ long actual = ((ExportTool.JsonSink) info.sink).info.docsWritten.get();
assertTrue("docs written :" + actual + "docs produced : " + info.docsWritten.get(), actual >= docCount);
assertJsonDocsCount(info, docCount);
} finally {
diff --git a/solr/solr-ref-guide/src/command-line-utilities.adoc b/solr/solr-ref-guide/src/command-line-utilities.adoc
index 83be372..392256c 100644
--- a/solr/solr-ref-guide/src/command-line-utilities.adoc
+++ b/solr/solr-ref-guide/src/command-line-utilities.adoc
@@ -154,3 +154,28 @@ Unlike the CLUSTERPROP command on the <<cluster-node-management.adoc#clusterprop
----
./server/scripts/cloud-scripts/zkcli.sh -zkhost 127.0.0.1:2181 -cmd clusterprop -name urlScheme -val https
----
+
+=== Export data from a collection to a file
+
+This command downloads documents from all shards in parallel and writes them to a single file. The supported formats are `jsonl` and `javabin`.
+
+Arguments are:
+
+ `-url` :: (Required parameter) The URL of the collection.
+ `-out` :: (Optional) Name of the file to write to. The default file name is `<collection-name>.json`. If the file name ends with `.json.gz`, the output is gzipped JSON.
+ `-format` :: (Optional) Supported values are json/javabin
+ `-limit` :: (Optional) Number of docs to export. By default the entire collection is exported.
+ `-fields` :: (Optional) Fields to be exported. By default, all fields are exported
+
+Example 1: Export all documents in the collection `gettingstarted` into a file called `gettingstarted.json`.
+[source,bash]
+----
+bin/solr export -url http://localhost:8983/solr/gettingstarted
+----
+
+Example 2: Export 1M docs of the collection `gettingstarted` into a file called `1MDocs.json.gz` as gzipped JSON.
+[source,bash]
+----
+bin/solr export -url http://localhost:8983/solr/gettingstarted -out 1MDocs.json.gz
+----
+