Posted to commits@solr.apache.org by ep...@apache.org on 2023/05/02 16:01:43 UTC

[solr] branch main updated: SOLR-16764: Be clearer that bin/solr export uses JSONL, not JSON format. Fix up how files are named and output and add tests. (#1614)

This is an automated email from the ASF dual-hosted git repository.

epugh pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new 4ff4a311d59 SOLR-16764: Be clearer that bin/solr export uses JSONL, not JSON format. Fix up how files are named and output and add tests. (#1614)
4ff4a311d59 is described below

commit 4ff4a311d590f2619a321e2d537b1a406599ac7d
Author: Eric Pugh <ep...@opensourceconnections.com>
AuthorDate: Tue May 2 12:01:37 2023 -0400

    SOLR-16764: Be clearer that bin/solr export uses JSONL, not JSON format. Fix up how files are named and output and add tests. (#1614)
    
    * We are exporting JSON with Lines, so let's use .jsonl
    * Allow the -out parameter to specify a directory as well as a file, and introduce a new separate -compress switch (see the sketch below)
    * Document new params for export, and clean up formatting.
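    
    A quick sketch of the resulting file names under the new behavior (the
    collection name techproducts and the /tmp/exports directory are only
    illustrative):
    
        # -out pointing at an existing directory writes /tmp/exports/techproducts.jsonl
        bin/solr export -url "http://localhost:8983/solr/techproducts" -out /tmp/exports
    
        # -out with a file name gets the format extension appended, and -compress
        # adds .gz, producing /tmp/exports/output.jsonl.gz
        bin/solr export -url "http://localhost:8983/solr/techproducts" -compress -out /tmp/exports/output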
---
 solr/CHANGES.txt                                   |  3 ++
 .../src/java/org/apache/solr/cli/ExportTool.java   | 55 ++++++++++++++--------
 .../test/org/apache/solr/cli/TestExportTool.java   | 27 +++++------
 solr/packaging/test/test_export.bats               | 36 ++++++++++++--
 .../pages/solr-control-script-reference.adoc       | 50 +++++++++++---------
 5 files changed, 112 insertions(+), 59 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index a6f8933b832..4ad05180a0c 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -123,6 +123,9 @@ Optimizations
 
 * SOLR-16693: For query timeAllowed, switch from ExitableDirectoryReader to TimeLimitingBulkScorer (David Smiley)
 
+* SOLR-16764: Clarify that ExportTool exports documents in JSON with Lines format, not standard JSON.  Add explicit -compress option for gzipping output.
+  Add ability to specify a directory for output, as well as a specific file, when using -out. (Eric Pugh)
+
 Bug Fixes
 ---------------------
 
diff --git a/solr/core/src/java/org/apache/solr/cli/ExportTool.java b/solr/core/src/java/org/apache/solr/cli/ExportTool.java
index f230c37e7e1..71c3ae1da80 100644
--- a/solr/core/src/java/org/apache/solr/cli/ExportTool.java
+++ b/solr/core/src/java/org/apache/solr/cli/ExportTool.java
@@ -33,6 +33,7 @@ import java.io.PrintStream;
 import java.io.Writer;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
+import java.nio.file.Path;
 import java.time.Instant;
 import java.time.format.DateTimeFormatter;
 import java.util.ArrayList;
@@ -97,6 +98,7 @@ public class ExportTool extends ToolBase {
   public abstract static class Info {
     String baseurl;
     String format;
+    boolean compress;
     String query;
     String coll;
     String out;
@@ -111,7 +113,7 @@ public class ExportTool extends ToolBase {
 
     public Info(String url) {
       setUrl(url);
-      setOutFormat(null, "jsonl");
+      setOutFormat(null, "jsonl", false);
     }
 
     public void setUrl(String url) {
@@ -126,21 +128,29 @@ public class ExportTool extends ToolBase {
       if (limit == -1) limit = Long.MAX_VALUE;
     }
 
-    public void setOutFormat(String out, String format) {
+    public void setOutFormat(String out, String format, boolean compress) {
+      this.compress = compress;
       this.format = format;
-      if (format == null) format = "jsonl";
-      if (!formats.contains(format)) {
-        throw new IllegalArgumentException("format must be one of :" + formats);
-      }
-
       this.out = out;
+      if (this.format == null) {
+        this.format = "jsonl";
+      }
+      if (!formats.contains(this.format)) {
+        throw new IllegalArgumentException("format must be one of: " + formats);
+      }
       if (this.out == null) {
-        this.out = JAVABIN.equals(format) ? coll + ".javabin" : coll + ".json";
+        this.out = coll;
+      } else if (Files.isDirectory(Path.of(this.out))) {
+        this.out = this.out + "/" + coll;
+      }
+      this.out = JAVABIN.equals(this.format) ? this.out + ".javabin" : this.out + ".jsonl";
+      if (compress) {
+        this.out = this.out + ".gz";
       }
     }
 
     DocsSink getSink() {
-      return JAVABIN.equals(format) ? new JavabinSink(this) : new JsonSink(this);
+      return JAVABIN.equals(format) ? new JavabinSink(this) : new JsonWithLinesSink(this);
     }
 
     abstract void exportDocs() throws Exception;
@@ -180,7 +190,8 @@ public class ExportTool extends ToolBase {
     String url = cli.getOptionValue("url");
     Info info = new MultiThreadedRunner(url);
     info.query = cli.getOptionValue("query", "*:*");
-    info.setOutFormat(cli.getOptionValue("out"), cli.getOptionValue("format"));
+    info.setOutFormat(
+        cli.getOptionValue("out"), cli.getOptionValue("format"), cli.hasOption("compress"));
     info.fields = cli.getOptionValue("fields");
     info.setLimit(cli.getOptionValue("limit", "100"));
     info.output = super.stdout;
@@ -215,14 +226,16 @@ public class ExportTool extends ToolBase {
           Option.builder("out")
               .hasArg()
               .required(false)
-              .desc("File name, defaults to 'collection-name.<format>'.")
+              .desc(
+                  "Path to output the exported data, and optionally the file name, defaults to 'collection-name'.")
               .build(),
           Option.builder("format")
               .hasArg()
               .required(false)
               .desc(
-                  "Output format for exported docs (json or javabin), defaulting to json. File extension would be .json.")
+                  "Output format for exported docs (jsonl or javabin), defaulting to jsonl, appended to the output file.")
               .build(),
+          Option.builder("compress").required(false).desc("Compress the output.").build(),
           Option.builder("limit")
               .hasArg()
               .required(false)
@@ -239,20 +252,21 @@ public class ExportTool extends ToolBase {
               .desc("Comma separated list of fields to export. By default all fields are fetched.")
               .build());
 
-  static class JsonSink extends DocsSink {
-    private CharArr charArr = new CharArr(1024 * 2);
+  static class JsonWithLinesSink extends DocsSink {
+    private final CharArr charArr = new CharArr(1024 * 2);
     JSONWriter jsonWriter = new JSONWriter(charArr, -1);
     private Writer writer;
 
-    public JsonSink(Info info) {
+    public JsonWithLinesSink(Info info) {
       this.info = info;
     }
 
     @Override
     public void start() throws IOException {
       fos = new FileOutputStream(info.out);
-      if (info.out.endsWith(".json.gz") || info.out.endsWith(".json."))
+      if (info.compress) {
         fos = new GZIPOutputStream(fos);
+      }
       if (info.bufferSize > 0) {
         fos = new BufferedOutputStream(fos, info.bufferSize);
       }
@@ -325,8 +339,9 @@ public class ExportTool extends ToolBase {
     @Override
     public void start() throws IOException {
       fos = new FileOutputStream(info.out);
-      if (info.out.endsWith(".json.gz") || info.out.endsWith(".json."))
+      if (info.compress) {
         fos = new GZIPOutputStream(fos);
+      }
       if (info.bufferSize > 0) {
         fos = new BufferedOutputStream(fos, info.bufferSize);
       }
@@ -346,7 +361,7 @@ public class ExportTool extends ToolBase {
       fos.close();
     }
 
-    private BiConsumer<String, Object> bic =
+    private final BiConsumer<String, Object> bic =
         new BiConsumer<>() {
           @Override
           public void accept(String s, Object o) {
@@ -378,7 +393,7 @@ public class ExportTool extends ToolBase {
     SolrDocument EOFDOC = new SolrDocument();
     volatile boolean failed = false;
     Map<String, CoreHandler> corehandlers = new HashMap<>();
-    private long startTime;
+    private final long startTime;
 
     @SuppressForbidden(reason = "Need to print out time")
     public MultiThreadedRunner(String url) {
@@ -460,7 +475,7 @@ public class ExportTool extends ToolBase {
       consumerThreadpool.submit(
           () -> {
             while (true) {
-              SolrDocument doc = null;
+              SolrDocument doc;
               try {
                 doc = queue.poll(30, TimeUnit.SECONDS);
               } catch (InterruptedException e) {
diff --git a/solr/core/src/test/org/apache/solr/cli/TestExportTool.java b/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
index 82424998ff0..dd57cde022f 100644
--- a/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
+++ b/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
@@ -23,8 +23,8 @@ import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.nio.charset.StandardCharsets;
-import java.util.Arrays;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.function.Predicate;
 import org.apache.lucene.tests.util.TestUtil;
@@ -82,7 +82,7 @@ public class TestExportTool extends SolrCloudTestCase {
 
       ExportTool.Info info = new ExportTool.MultiThreadedRunner(url);
       String absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".json";
-      info.setOutFormat(absolutePath, "jsonl");
+      info.setOutFormat(absolutePath, "jsonl", false);
       info.setLimit("200");
       info.fields = "id,desc_s,a_dt";
       info.exportDocs();
@@ -91,7 +91,7 @@ public class TestExportTool extends SolrCloudTestCase {
 
       info = new ExportTool.MultiThreadedRunner(url);
       absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".json";
-      info.setOutFormat(absolutePath, "jsonl");
+      info.setOutFormat(absolutePath, "jsonl", false);
       info.setLimit("-1");
       info.fields = "id,desc_s";
       info.exportDocs();
@@ -100,7 +100,7 @@ public class TestExportTool extends SolrCloudTestCase {
 
       info = new ExportTool.MultiThreadedRunner(url);
       absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".javabin";
-      info.setOutFormat(absolutePath, "javabin");
+      info.setOutFormat(absolutePath, "javabin", false);
       info.setLimit("200");
       info.fields = "id,desc_s";
       info.exportDocs();
@@ -109,7 +109,7 @@ public class TestExportTool extends SolrCloudTestCase {
 
       info = new ExportTool.MultiThreadedRunner(url);
       absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".javabin";
-      info.setOutFormat(absolutePath, "javabin");
+      info.setOutFormat(absolutePath, "javabin", false);
       info.setLimit("-1");
       info.fields = "id,desc_s";
       info.exportDocs();
@@ -169,13 +169,13 @@ public class TestExportTool extends SolrCloudTestCase {
       }
       assertEquals(docCount, totalDocsFromCores);
 
-      ExportTool.MultiThreadedRunner info = null;
-      String absolutePath = null;
+      ExportTool.MultiThreadedRunner info;
+      String absolutePath;
 
       info = new ExportTool.MultiThreadedRunner(url);
       info.output = System.out;
       absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".javabin";
-      info.setOutFormat(absolutePath, "javabin");
+      info.setOutFormat(absolutePath, "javabin", false);
       info.setLimit("-1");
       info.exportDocs();
       assertJavabinDocsCount(info, docCount);
@@ -186,11 +186,11 @@ public class TestExportTool extends SolrCloudTestCase {
       info = new ExportTool.MultiThreadedRunner(url);
       info.output = System.out;
       absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".json";
-      info.setOutFormat(absolutePath, "jsonl");
+      info.setOutFormat(absolutePath, "jsonl", false);
       info.fields = "id,desc_s";
       info.setLimit("-1");
       info.exportDocs();
-      long actual = ((ExportTool.JsonSink) info.sink).info.docsWritten.get();
+      long actual = info.sink.info.docsWritten.get();
       assertTrue(
           "docs written :" + actual + "docs produced : " + info.docsWritten.get(),
           actual >= docCount);
@@ -203,8 +203,7 @@ public class TestExportTool extends SolrCloudTestCase {
   private void assertJavabinDocsCount(ExportTool.Info info, int expected) throws IOException {
     assertTrue(
         "" + info.docsWritten.get() + " expected " + expected, info.docsWritten.get() >= expected);
-    FileInputStream fis = new FileInputStream(info.out);
-    try {
+    try (FileInputStream fis = new FileInputStream(info.out)) {
       int[] count = new int[] {0};
       FastInputStream in = FastInputStream.wrap(fis);
       new JavaBinUpdateRequestCodec()
@@ -215,8 +214,6 @@ public class TestExportTool extends SolrCloudTestCase {
                 count[0]++;
               });
       assertTrue(count[0] >= expected);
-    } finally {
-      fis.close();
     }
   }
 
@@ -228,7 +225,7 @@ public class TestExportTool extends SolrCloudTestCase {
 
     JsonRecordReader jsonReader;
     Reader rdr;
-    jsonReader = JsonRecordReader.getInst("/", Arrays.asList("$FQN:/**"));
+    jsonReader = JsonRecordReader.getInst("/", List.of("$FQN:/**"));
     rdr = new InputStreamReader(new FileInputStream(info.out), StandardCharsets.UTF_8);
     try {
       int[] count = new int[] {0};
diff --git a/solr/packaging/test/test_export.bats b/solr/packaging/test/test_export.bats
index 40f5cbf609b..53f5180fcea 100644
--- a/solr/packaging/test/test_export.bats
+++ b/solr/packaging/test/test_export.bats
@@ -29,11 +29,41 @@ teardown() {
 }
 
 @test "Check export command" {
-  run solr start -c
-  run solr create_collection -c COLL_NAME
-  run solr export -url "http://localhost:8983/solr/COLL_NAME" -query "*:* -id:test" -out "${BATS_TEST_TMPDIR}/output"
+  run solr start -c -e techproducts
+  run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -out "${BATS_TEST_TMPDIR}/output"
+
   refute_output --partial 'Unrecognized option'
   assert_output --partial 'Export complete'
+
+  assert [ -e ${BATS_TEST_TMPDIR}/output.jsonl ]
+
+  run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test"
+  assert [ -e techproducts.jsonl ]
+  rm techproducts.jsonl
+
+  run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -format javabin
+  assert [ -e techproducts.javabin ]
+  rm techproducts.javabin
+
+  # The old pattern of putting a suffix on -out to control the format is no longer supported ;-).
+  run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -out "${BATS_TEST_TMPDIR}/output.javabin"
+  assert [ -e ${BATS_TEST_TMPDIR}/output.javabin.jsonl ]
+
+  run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -out "${BATS_TEST_TMPDIR}"
+  assert [ -e ${BATS_TEST_TMPDIR}/techproducts.jsonl ]
+
+  run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -format jsonl -out "${BATS_TEST_TMPDIR}/output"
+  assert [ -e ${BATS_TEST_TMPDIR}/output.jsonl ]
+
+  run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -compress -format jsonl -out "${BATS_TEST_TMPDIR}/output"
+  assert [ -e ${BATS_TEST_TMPDIR}/output.jsonl.gz ]
+
+  # Confirm that plain json is not supported as a format right now.
+  run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -format json -out "${BATS_TEST_TMPDIR}/output.json"
+  assert_output --partial 'format must be one of:'
+  refute [ -e ${BATS_TEST_TMPDIR}/output.json ]
+
+
 }
 
 @test "export fails on non cloud mode" {
diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc b/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
index acd466242f7..72495320aa6 100644
--- a/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
+++ b/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
@@ -1384,6 +1384,8 @@ Examples of this command:
 The `export` command will allow you to export documents from a collection in either JSON or Javabin format.
 All documents can be exported, or only those that match a query.
 
+NOTE: This hasn't been tested with nested child documents, and your results may vary.
+
 NOTE: The `export` command only works with a Solr running in cloud mode.
 
 `bin/solr export [options]`
@@ -1392,7 +1394,7 @@ NOTE: The `export` command only works with in a Solr running in cloud mode.
 
 The `bin/solr export` command takes the following parameters:
 
-`url`::
+`-url`::
 +
 [%autowidth,frame=none]
 |===
@@ -1403,7 +1405,7 @@ Fully-qualified address to a collection.
 +
 *Example*: `-url http://localhost:8983/solr/techproducts`
 
-`format`::
+`-format`::
 +
 [%autowidth,frame=none]
 |===
@@ -1411,7 +1413,7 @@ Fully-qualified address to a collection.
 |===
 +
 The file format of the export, `jsonl` or `javabin`.
-Choosing `javabin` exports to a file with extension `.javabin` which is the native Solr format.
+Choosing `javabin` exports in the native Solr format.
 This is compact and faster to import.
 
 `-out`::
@@ -1421,12 +1423,20 @@ This is compact and faster to import.
 |Optional |Default: _see description_
 |===
 +
-The file name of the export.
-If the file name ends with `json.gz` the output will be compressed into a .gz file.
+Either the path to a directory in which to write the export, or a specific file name to write to.
 +
-If not provided, a file will be created with the name of the collection, as in `<collection>.json`.
+If not provided, a file will be created with the name of the collection, as in `<collection>.jsonl`.
 
-`query`::
+`-compress`::
++
+[%autowidth,frame=none]
+|===
+|Optional |Default: false
+|===
++
+If you specify `-compress` then the resulting output file will be gzipped, for example `<collection>.jsonl.gz`.
+
+`-query`::
 +
 [%autowidth,frame=none]
 |===
@@ -1436,7 +1446,7 @@ If not provided, a file will be created with the name of the collection, as in `
 A custom query.
 The default is `\*:*` which will export all documents.
 
-`fields`::
+`-fields`::
 +
 [%autowidth,frame=none]
 |===
@@ -1446,7 +1456,7 @@ The default is `\*:*` which will export all documents.
 A comma separated list of fields to be exported.
 If not provided, all fields will be included.
 
-`limit`::
+`-limit`::
 +
 [%autowidth,frame=none]
 |===
@@ -1463,11 +1473,11 @@ Export all documents from a collection `gettingstarted`:
 [source,bash]
 bin/solr export -url http://localhost:8983/solr/gettingstarted -limit -1
 
-Export all documents of collection `gettingstarted` into a file called `1MDocs.json.gz` as a zipped JSON file:
+Export all documents of collection `gettingstarted` into a file called `1MDocs.jsonl.gz` as a compressed JSONL file:
 
 [source,bash]
 ----
-bin/solr export -url http://localhost:8983/solr/gettingstarted -1 -out 1MDocs.json.gz
+bin/solr export -url http://localhost:8983/solr/gettingstarted -limit -1 -format jsonl -compress -out 1MDocs
 ----
 
 === Importing Documents into a Collection
@@ -1486,24 +1496,22 @@ Once you have exported documents in a file, you can use the xref:indexing-guide:
 
 The `api` command will allow you to send an arbitrary HTTP request to a Solr API endpoint.
 
-`bin/solr api -get `
+`bin/solr api -get http://localhost:8983/solr/COLL_NAME/sql?stmt=select+id+from+COLL_NAME+limit+10`
 
-http://localhost:8983/solr/COLL_NAME/sql?stmt=select+id+from+COLL_NAME+limit+10
+`bin/solr api -help`
 
-`bin/solr export -help`
+The `bin/solr api` command takes the following parameters:
 
-The `bin/solr export` command takes the following parameters:
-
-`get`::
+`-get`::
 +
 [%autowidth,frame=none]
 |===
-s|Required |Default: none
+|Required |Default: none
 |===
 +
 Fully-qualified address to a collection.
 +
-*Example*: `-get http://localhost:8983/solr/COLL_NAME/sql?stmt=select+id+from+COLL_NAME+limit+10`
+*Example*: `bin/solr api -get http://localhost:8983/solr/COLL_NAME/sql?stmt=select+id+from+COLL_NAME+limit+10`
 
 === API
 
@@ -1514,9 +1522,9 @@ If you have configured basicAuth or TLS with your Solr you may find this easier
 $ bin/solr api -get http://localhost:8983/solr/techproducts/select?q=*:*
 
 
-Here is an example of sending a SQL query to the techproducts /sql end point (assumes you start Solr in Cloud mode with the SQL module enabled):
+Here is an example of sending a SQL query to the techproducts /sql end point (assumes you started Solr in Cloud mode with the SQL module enabled):
 
 [source,bash]
 $ bin/solr api -get http://localhost:8983/solr/techproducts/sql?stmt=select+id+from+techproducts+limit+10
 
-Results are streamed to the console.
+Results are streamed to the terminal.