You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by ep...@apache.org on 2023/05/02 16:01:43 UTC
[solr] branch main updated: SOLR-16764: Be clearer that bin/solr export uses JSONL, not JSON format. Fix up how files are named and output and add tests. (#1614)
This is an automated email from the ASF dual-hosted git repository.
epugh pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/main by this push:
new 4ff4a311d59 SOLR-16764: Be clearer that bin/solr export uses JSONL, not JSON format. Fix up how files are named and output and add tests. (#1614)
4ff4a311d59 is described below
commit 4ff4a311d590f2619a321e2d537b1a406599ac7d
Author: Eric Pugh <ep...@opensourceconnections.com>
AuthorDate: Tue May 2 12:01:37 2023 -0400
SOLR-16764: Be clearer that bin/solr export uses JSONL, not JSON format. Fix up how files are named and output and add tests. (#1614)
* We are exporting in JSON with Lines format, so let's use .jsonl
* Allow the -out parameter to specify a directory as well as a file, introduce a new separate -compress switch
* document new params for export, and clean up formatting.
---
solr/CHANGES.txt | 3 ++
.../src/java/org/apache/solr/cli/ExportTool.java | 55 ++++++++++++++--------
.../test/org/apache/solr/cli/TestExportTool.java | 27 +++++------
solr/packaging/test/test_export.bats | 36 ++++++++++++--
.../pages/solr-control-script-reference.adoc | 50 +++++++++++---------
5 files changed, 112 insertions(+), 59 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index a6f8933b832..4ad05180a0c 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -123,6 +123,9 @@ Optimizations
* SOLR-16693: For query timeAllowed, switch from ExitableDirectoryReader to TimeLimitingBulkScorer (David Smiley)
+* SOLR-16764: Clarify that ExportTool exports documents in JSON with Lines format, not standard JSON. Add explicit -compress option for gzipping output.
+ Add ability to specify a directory for output along with a specific file when using -out. (Eric Pugh)
+
Bug Fixes
---------------------
diff --git a/solr/core/src/java/org/apache/solr/cli/ExportTool.java b/solr/core/src/java/org/apache/solr/cli/ExportTool.java
index f230c37e7e1..71c3ae1da80 100644
--- a/solr/core/src/java/org/apache/solr/cli/ExportTool.java
+++ b/solr/core/src/java/org/apache/solr/cli/ExportTool.java
@@ -33,6 +33,7 @@ import java.io.PrintStream;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
+import java.nio.file.Path;
import java.time.Instant;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
@@ -97,6 +98,7 @@ public class ExportTool extends ToolBase {
public abstract static class Info {
String baseurl;
String format;
+ boolean compress;
String query;
String coll;
String out;
@@ -111,7 +113,7 @@ public class ExportTool extends ToolBase {
public Info(String url) {
setUrl(url);
- setOutFormat(null, "jsonl");
+ setOutFormat(null, "jsonl", false);
}
public void setUrl(String url) {
@@ -126,21 +128,29 @@ public class ExportTool extends ToolBase {
if (limit == -1) limit = Long.MAX_VALUE;
}
- public void setOutFormat(String out, String format) {
+ public void setOutFormat(String out, String format, boolean compress) {
+ this.compress = compress;
this.format = format;
- if (format == null) format = "jsonl";
- if (!formats.contains(format)) {
- throw new IllegalArgumentException("format must be one of :" + formats);
- }
-
this.out = out;
+ if (this.format == null) {
+ this.format = "jsonl";
+ }
+ if (!formats.contains(this.format)) {
+ throw new IllegalArgumentException("format must be one of: " + formats);
+ }
if (this.out == null) {
- this.out = JAVABIN.equals(format) ? coll + ".javabin" : coll + ".json";
+ this.out = coll;
+ } else if (Files.isDirectory(Path.of(this.out))) {
+ this.out = this.out + "/" + coll;
+ }
+ this.out = JAVABIN.equals(this.format) ? this.out + ".javabin" : this.out + ".jsonl";
+ if (compress) {
+ this.out = this.out + ".gz";
}
}
DocsSink getSink() {
- return JAVABIN.equals(format) ? new JavabinSink(this) : new JsonSink(this);
+ return JAVABIN.equals(format) ? new JavabinSink(this) : new JsonWithLinesSink(this);
}
abstract void exportDocs() throws Exception;
@@ -180,7 +190,8 @@ public class ExportTool extends ToolBase {
String url = cli.getOptionValue("url");
Info info = new MultiThreadedRunner(url);
info.query = cli.getOptionValue("query", "*:*");
- info.setOutFormat(cli.getOptionValue("out"), cli.getOptionValue("format"));
+ info.setOutFormat(
+ cli.getOptionValue("out"), cli.getOptionValue("format"), cli.hasOption("compress"));
info.fields = cli.getOptionValue("fields");
info.setLimit(cli.getOptionValue("limit", "100"));
info.output = super.stdout;
@@ -215,14 +226,16 @@ public class ExportTool extends ToolBase {
Option.builder("out")
.hasArg()
.required(false)
- .desc("File name, defaults to 'collection-name.<format>'.")
+ .desc(
+ "Path to output the exported data, and optionally the file name, defaults to 'collection-name'.")
.build(),
Option.builder("format")
.hasArg()
.required(false)
.desc(
- "Output format for exported docs (json or javabin), defaulting to json. File extension would be .json.")
+ "Output format for exported docs (jsonl or javabin), defaulting to jsonl, appended to the output file.")
.build(),
+ Option.builder("compress").required(false).desc("Compress the output.").build(),
Option.builder("limit")
.hasArg()
.required(false)
@@ -239,20 +252,21 @@ public class ExportTool extends ToolBase {
.desc("Comma separated list of fields to export. By default all fields are fetched.")
.build());
- static class JsonSink extends DocsSink {
- private CharArr charArr = new CharArr(1024 * 2);
+ static class JsonWithLinesSink extends DocsSink {
+ private final CharArr charArr = new CharArr(1024 * 2);
JSONWriter jsonWriter = new JSONWriter(charArr, -1);
private Writer writer;
- public JsonSink(Info info) {
+ public JsonWithLinesSink(Info info) {
this.info = info;
}
@Override
public void start() throws IOException {
fos = new FileOutputStream(info.out);
- if (info.out.endsWith(".json.gz") || info.out.endsWith(".json."))
+ if (info.compress) {
fos = new GZIPOutputStream(fos);
+ }
if (info.bufferSize > 0) {
fos = new BufferedOutputStream(fos, info.bufferSize);
}
@@ -325,8 +339,9 @@ public class ExportTool extends ToolBase {
@Override
public void start() throws IOException {
fos = new FileOutputStream(info.out);
- if (info.out.endsWith(".json.gz") || info.out.endsWith(".json."))
+ if (info.compress) {
fos = new GZIPOutputStream(fos);
+ }
if (info.bufferSize > 0) {
fos = new BufferedOutputStream(fos, info.bufferSize);
}
@@ -346,7 +361,7 @@ public class ExportTool extends ToolBase {
fos.close();
}
- private BiConsumer<String, Object> bic =
+ private final BiConsumer<String, Object> bic =
new BiConsumer<>() {
@Override
public void accept(String s, Object o) {
@@ -378,7 +393,7 @@ public class ExportTool extends ToolBase {
SolrDocument EOFDOC = new SolrDocument();
volatile boolean failed = false;
Map<String, CoreHandler> corehandlers = new HashMap<>();
- private long startTime;
+ private final long startTime;
@SuppressForbidden(reason = "Need to print out time")
public MultiThreadedRunner(String url) {
@@ -460,7 +475,7 @@ public class ExportTool extends ToolBase {
consumerThreadpool.submit(
() -> {
while (true) {
- SolrDocument doc = null;
+ SolrDocument doc;
try {
doc = queue.poll(30, TimeUnit.SECONDS);
} catch (InterruptedException e) {
diff --git a/solr/core/src/test/org/apache/solr/cli/TestExportTool.java b/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
index 82424998ff0..dd57cde022f 100644
--- a/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
+++ b/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
@@ -23,8 +23,8 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
-import java.util.Arrays;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.function.Predicate;
import org.apache.lucene.tests.util.TestUtil;
@@ -82,7 +82,7 @@ public class TestExportTool extends SolrCloudTestCase {
ExportTool.Info info = new ExportTool.MultiThreadedRunner(url);
String absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".json";
- info.setOutFormat(absolutePath, "jsonl");
+ info.setOutFormat(absolutePath, "jsonl", false);
info.setLimit("200");
info.fields = "id,desc_s,a_dt";
info.exportDocs();
@@ -91,7 +91,7 @@ public class TestExportTool extends SolrCloudTestCase {
info = new ExportTool.MultiThreadedRunner(url);
absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".json";
- info.setOutFormat(absolutePath, "jsonl");
+ info.setOutFormat(absolutePath, "jsonl", false);
info.setLimit("-1");
info.fields = "id,desc_s";
info.exportDocs();
@@ -100,7 +100,7 @@ public class TestExportTool extends SolrCloudTestCase {
info = new ExportTool.MultiThreadedRunner(url);
absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".javabin";
- info.setOutFormat(absolutePath, "javabin");
+ info.setOutFormat(absolutePath, "javabin", false);
info.setLimit("200");
info.fields = "id,desc_s";
info.exportDocs();
@@ -109,7 +109,7 @@ public class TestExportTool extends SolrCloudTestCase {
info = new ExportTool.MultiThreadedRunner(url);
absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".javabin";
- info.setOutFormat(absolutePath, "javabin");
+ info.setOutFormat(absolutePath, "javabin", false);
info.setLimit("-1");
info.fields = "id,desc_s";
info.exportDocs();
@@ -169,13 +169,13 @@ public class TestExportTool extends SolrCloudTestCase {
}
assertEquals(docCount, totalDocsFromCores);
- ExportTool.MultiThreadedRunner info = null;
- String absolutePath = null;
+ ExportTool.MultiThreadedRunner info;
+ String absolutePath;
info = new ExportTool.MultiThreadedRunner(url);
info.output = System.out;
absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".javabin";
- info.setOutFormat(absolutePath, "javabin");
+ info.setOutFormat(absolutePath, "javabin", false);
info.setLimit("-1");
info.exportDocs();
assertJavabinDocsCount(info, docCount);
@@ -186,11 +186,11 @@ public class TestExportTool extends SolrCloudTestCase {
info = new ExportTool.MultiThreadedRunner(url);
info.output = System.out;
absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".json";
- info.setOutFormat(absolutePath, "jsonl");
+ info.setOutFormat(absolutePath, "jsonl", false);
info.fields = "id,desc_s";
info.setLimit("-1");
info.exportDocs();
- long actual = ((ExportTool.JsonSink) info.sink).info.docsWritten.get();
+ long actual = info.sink.info.docsWritten.get();
assertTrue(
"docs written :" + actual + "docs produced : " + info.docsWritten.get(),
actual >= docCount);
@@ -203,8 +203,7 @@ public class TestExportTool extends SolrCloudTestCase {
private void assertJavabinDocsCount(ExportTool.Info info, int expected) throws IOException {
assertTrue(
"" + info.docsWritten.get() + " expected " + expected, info.docsWritten.get() >= expected);
- FileInputStream fis = new FileInputStream(info.out);
- try {
+ try (FileInputStream fis = new FileInputStream(info.out)) {
int[] count = new int[] {0};
FastInputStream in = FastInputStream.wrap(fis);
new JavaBinUpdateRequestCodec()
@@ -215,8 +214,6 @@ public class TestExportTool extends SolrCloudTestCase {
count[0]++;
});
assertTrue(count[0] >= expected);
- } finally {
- fis.close();
}
}
@@ -228,7 +225,7 @@ public class TestExportTool extends SolrCloudTestCase {
JsonRecordReader jsonReader;
Reader rdr;
- jsonReader = JsonRecordReader.getInst("/", Arrays.asList("$FQN:/**"));
+ jsonReader = JsonRecordReader.getInst("/", List.of("$FQN:/**"));
rdr = new InputStreamReader(new FileInputStream(info.out), StandardCharsets.UTF_8);
try {
int[] count = new int[] {0};
diff --git a/solr/packaging/test/test_export.bats b/solr/packaging/test/test_export.bats
index 40f5cbf609b..53f5180fcea 100644
--- a/solr/packaging/test/test_export.bats
+++ b/solr/packaging/test/test_export.bats
@@ -29,11 +29,41 @@ teardown() {
}
@test "Check export command" {
- run solr start -c
- run solr create_collection -c COLL_NAME
- run solr export -url "http://localhost:8983/solr/COLL_NAME" -query "*:* -id:test" -out "${BATS_TEST_TMPDIR}/output"
+ run solr start -c -e techproducts
+ run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -out "${BATS_TEST_TMPDIR}/output"
+
refute_output --partial 'Unrecognized option'
assert_output --partial 'Export complete'
+
+ assert [ -e ${BATS_TEST_TMPDIR}/output.jsonl ]
+
+ run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test"
+ assert [ -e techproducts.jsonl ]
+ rm techproducts.jsonl
+
+ run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -format javabin
+ assert [ -e techproducts.javabin ]
+ rm techproducts.javabin
+
+ # old pattern of putting a suffix on the out that controlled the format no longer supported ;-).
+ run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -out "${BATS_TEST_TMPDIR}/output.javabin"
+ assert [ -e ${BATS_TEST_TMPDIR}/output.javabin.jsonl ]
+
+ run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -out "${BATS_TEST_TMPDIR}"
+ assert [ -e ${BATS_TEST_TMPDIR}/techproducts.jsonl ]
+
+ run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -format jsonl -out "${BATS_TEST_TMPDIR}/output"
+ assert [ -e ${BATS_TEST_TMPDIR}/output.jsonl ]
+
+ run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -compress -format jsonl -out "${BATS_TEST_TMPDIR}/output"
+ assert [ -e ${BATS_TEST_TMPDIR}/output.jsonl.gz ]
+
+ # Confirm we don't properly support json right now.
+ run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -format json -out "${BATS_TEST_TMPDIR}/output.json"
+ assert_output --partial 'format must be one of:'
+ refute [ -e ${BATS_TEST_TMPDIR}/output.json ]
+
+
}
@test "export fails on non cloud mode" {
diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc b/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
index acd466242f7..72495320aa6 100644
--- a/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
+++ b/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
@@ -1384,6 +1384,8 @@ Examples of this command:
The `export` command will allow you to export documents from a collection in either JSON with Lines (JSONL) or Javabin format.
All documents can be exported, or only those that match a query.
+NOTE: This hasn't been tested with nested child documents and your results will vary.
+
NOTE: The `export` command only works with Solr running in cloud mode.
`bin/solr export [options]`
@@ -1392,7 +1394,7 @@ NOTE: The `export` command only works with in a Solr running in cloud mode.
The `bin/solr export` command takes the following parameters:
-`url`::
+`-url`::
+
[%autowidth,frame=none]
|===
@@ -1403,7 +1405,7 @@ Fully-qualified address to a collection.
+
*Example*: `-url http://localhost:8983/solr/techproducts`
-`format`::
+`-format`::
+
[%autowidth,frame=none]
|===
@@ -1411,7 +1413,7 @@ Fully-qualified address to a collection.
|===
+
The file format of the export, `jsonl` or `javabin`.
-Choosing `javabin` exports to a file with extension `.javabin` which is the native Solr format.
+Choosing `javabin` exports in the native Solr format.
This is compact and faster to import.
`-out`::
@@ -1421,12 +1423,20 @@ This is compact and faster to import.
|Optional |Default: _see description_
|===
+
-The file name of the export.
-If the file name ends with `json.gz` the output will be compressed into a .gz file.
+Either the path to a directory for the export to happen in, or a specific file name to populate.
+
-If not provided, a file will be created with the name of the collection, as in `<collection>.json`.
+If not provided, a file will be created with the name of the collection, as in `<collection>.jsonl`.
-`query`::
+`-compress`::
++
+[%autowidth,frame=none]
+|===
+|Optional |Default: false
+|===
++
+If you specify `-compress`, the resulting output file will be gzipped, for example `<collection>.jsonl.gz`.
+
+`-query`::
+
[%autowidth,frame=none]
|===
@@ -1436,7 +1446,7 @@ If not provided, a file will be created with the name of the collection, as in `
A custom query.
The default is `\*:*` which will export all documents.
-`fields`::
+`-fields`::
+
[%autowidth,frame=none]
|===
@@ -1446,7 +1456,7 @@ The default is `\*:*` which will export all documents.
A comma separated list of fields to be exported.
If not provided, all fields will be included.
-`limit`::
+`-limit`::
+
[%autowidth,frame=none]
|===
@@ -1463,11 +1473,11 @@ Export all documents from a collection `gettingstarted`:
[source,bash]
bin/solr export -url http://localhost:8983/solr/gettingstarted -limit -1
-Export all documents of collection `gettingstarted` into a file called `1MDocs.json.gz` as a zipped JSON file:
+Export all documents of collection `gettingstarted` into a file called `1MDocs.jsonl.gz` as a compressed JSONL file:
[source,bash]
----
-bin/solr export -url http://localhost:8983/solr/gettingstarted -1 -out 1MDocs.json.gz
+bin/solr export -url http://localhost:8983/solr/gettingstarted -limit -1 -format jsonl -compress -out 1MDocs
----
=== Importing Documents into a Collection
@@ -1486,24 +1496,22 @@ Once you have exported documents in a file, you can use the xref:indexing-guide:
The `api` command will allow you to send an arbitrary HTTP request to a Solr API endpoint.
-`bin/solr api -get `
+`bin/solr api -get http://localhost:8983/solr/COLL_NAME/sql?stmt=select+id+from+COLL_NAME+limit+10`
-http://localhost:8983/solr/COLL_NAME/sql?stmt=select+id+from+COLL_NAME+limit+10
+`bin/solr api -help`
-`bin/solr export -help`
+The `bin/solr api` command takes the following parameters:
-The `bin/solr export` command takes the following parameters:
-
-`get`::
+`-get`::
+
[%autowidth,frame=none]
|===
-s|Required |Default: none
+|Required |Default: none
|===
+
Fully-qualified address to a collection.
+
-*Example*: `-get http://localhost:8983/solr/COLL_NAME/sql?stmt=select+id+from+COLL_NAME+limit+10`
+*Example*: `bin/solr api -get http://localhost:8983/solr/COLL_NAME/sql?stmt=select+id+from+COLL_NAME+limit+10`
=== API
@@ -1514,9 +1522,9 @@ If you have configured basicAuth or TLS with your Solr you may find this easier
$ bin/solr api -get http://localhost:8983/solr/techproducts/select?q=*:*
-Here is an example of sending a SQL query to the techproducts /sql end point (assumes you start Solr in Cloud mode with the SQL module enabled):
+Here is an example of sending a SQL query to the techproducts /sql end point (assumes you started Solr in Cloud mode with the SQL module enabled):
[source,bash]
$ bin/solr api -get http://localhost:8983/solr/techproducts/sql?stmt=select+id+from+techproducts+limit+10
-Results are streamed to the console.
+Results are streamed to the terminal.