Posted to commits@solr.apache.org by ep...@apache.org on 2023/05/09 11:05:08 UTC

[solr] branch main updated: SOLR-16782: Export tool should export in JSON that matches Solr Import Format (#1623)

This is an automated email from the ASF dual-hosted git repository.

epugh pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new ad4875d9bb2 SOLR-16782: Export tool should export in JSON that matches Solr Import Format (#1623)
ad4875d9bb2 is described below

commit ad4875d9bb204c0e98b5178bca804cba897929c2
Author: Eric Pugh <ep...@opensourceconnections.com>
AuthorDate: Tue May 9 07:05:01 2023 -0400

    SOLR-16782: Export tool should export in JSON that matches Solr Import Format (#1623)
    
    Fixed a bug where the JSON output format was actually a JSON with Lines output format, by introducing a jsonl output parameter. Added a proper JSON output format.
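
For illustration, a minimal sketch of the resulting command-line behaviour, reusing the local URL and techproducts collection that appear in the packaging tests below; the /tmp/output path is only an example:

    # default format is now json: a single JSON array written to techproducts.json
    bin/solr export -url "http://localhost:8983/solr/techproducts" -query "*:*"

    # the previous one-document-per-line output is still available via -format jsonl,
    # producing /tmp/output.jsonl here
    bin/solr export -url "http://localhost:8983/solr/techproducts" -query "*:*" -format jsonl -out /tmp/output
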
---
 solr/CHANGES.txt                                   |   2 +
 .../src/java/org/apache/solr/cli/ExportTool.java   | 109 +++++++++++++++++++--
 .../test/org/apache/solr/cli/TestExportTool.java   |  33 ++++++-
 solr/packaging/test/test_export.bats               |  16 +--
 .../pages/solr-control-script-reference.adoc       |  52 +++++++---
 5 files changed, 178 insertions(+), 34 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 38276303ae7..85fb2c52a4a 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -113,6 +113,8 @@ Improvements
 * SOLR-16394: The v2 "restore" API has been tweaked to be more intuitive.  The top-level "restore-collection" command
   specifier has been removed, and the API now lives at the new path `POST /api/backups/backupName/restore`. (Jason Gerlowski)
 
+* SOLR-16782: bin/solr export tool now supports JSON export format as well as the existing JSON With Lines format. (Eric Pugh)
+
 Optimizations
 ---------------------
 
diff --git a/solr/core/src/java/org/apache/solr/cli/ExportTool.java b/solr/core/src/java/org/apache/solr/cli/ExportTool.java
index 71c3ae1da80..a262018206a 100644
--- a/solr/core/src/java/org/apache/solr/cli/ExportTool.java
+++ b/solr/core/src/java/org/apache/solr/cli/ExportTool.java
@@ -19,6 +19,7 @@ package org.apache.solr.cli;
 
 import static org.apache.solr.common.params.CommonParams.FL;
 import static org.apache.solr.common.params.CommonParams.JAVABIN;
+import static org.apache.solr.common.params.CommonParams.JSON;
 import static org.apache.solr.common.params.CommonParams.Q;
 import static org.apache.solr.common.params.CommonParams.SORT;
 import static org.apache.solr.common.util.JavaBinCodec.SOLRINPUTDOC;
@@ -133,7 +134,7 @@ public class ExportTool extends ToolBase {
       this.format = format;
       this.out = out;
       if (this.format == null) {
-        this.format = "jsonl";
+        this.format = JSON;
       }
       if (!formats.contains(this.format)) {
         throw new IllegalArgumentException("format must be one of: " + formats);
@@ -143,14 +144,26 @@ public class ExportTool extends ToolBase {
       } else if (Files.isDirectory(Path.of(this.out))) {
         this.out = this.out + "/" + coll;
       }
-      this.out = JAVABIN.equals(this.format) ? this.out + ".javabin" : this.out + ".jsonl";
+      this.out = this.out + '.' + this.format;
       if (compress) {
         this.out = this.out + ".gz";
       }
     }
 
     DocsSink getSink() {
-      return JAVABIN.equals(format) ? new JavabinSink(this) : new JsonWithLinesSink(this);
+      DocsSink docSink = null;
+      switch (format) {
+        case JAVABIN:
+          docSink = new JavabinSink(this);
+          break;
+        case JSON:
+          docSink = new JsonSink(this);
+          break;
+        case "jsonl":
+          docSink = new JsonWithLinesSink(this);
+          break;
+      }
+      return docSink;
     }
 
     abstract void exportDocs() throws Exception;
@@ -183,7 +196,7 @@ public class ExportTool extends ToolBase {
     }
   }
 
-  static Set<String> formats = Set.of(JAVABIN, "jsonl");
+  static Set<String> formats = Set.of(JAVABIN, JSON, "jsonl");
 
   @Override
   public void runImpl(CommandLine cli) throws Exception {
@@ -232,8 +245,7 @@ public class ExportTool extends ToolBase {
           Option.builder("format")
               .hasArg()
               .required(false)
-              .desc(
-                  "Output format for exported docs (jsonl or javabin), defaulting to jsonl, appended to the output file.")
+              .desc("Output format for exported docs (json, jsonl or javabin), defaulting to json.")
               .build(),
           Option.builder("compress").required(false).desc("Compress the output.").build(),
           Option.builder("limit")
@@ -329,6 +341,91 @@ public class ExportTool extends ToolBase {
     }
   }
 
+  static class JsonSink extends DocsSink {
+    private final CharArr charArr = new CharArr(1024 * 2);
+    JSONWriter jsonWriter = new JSONWriter(charArr, -1);
+    private Writer writer;
+    private boolean firstDoc = true;
+
+    public JsonSink(Info info) {
+      this.info = info;
+    }
+
+    @Override
+    public void start() throws IOException {
+      fos = new FileOutputStream(info.out);
+      if (info.compress) {
+        fos = new GZIPOutputStream(fos);
+      }
+      if (info.bufferSize > 0) {
+        fos = new BufferedOutputStream(fos, info.bufferSize);
+      }
+      writer = new OutputStreamWriter(fos, StandardCharsets.UTF_8);
+      writer.append('[');
+    }
+
+    @Override
+    public void end() throws IOException {
+      writer.append(']');
+      writer.flush();
+      fos.flush();
+      fos.close();
+    }
+
+    @Override
+    public synchronized void accept(SolrDocument doc) throws IOException {
+      charArr.reset();
+      Map<String, Object> m = CollectionUtil.newLinkedHashMap(doc.size());
+      doc.forEach(
+          (s, field) -> {
+            if (s.equals("_version_") || s.equals("_roor_")) return;
+            if (field instanceof List) {
+              if (((List<?>) field).size() == 1) {
+                field = ((List<?>) field).get(0);
+              }
+            }
+            field = constructDateStr(field);
+            if (field instanceof List) {
+              List<?> list = (List<?>) field;
+              if (hasdate(list)) {
+                ArrayList<Object> listCopy = new ArrayList<>(list.size());
+                for (Object o : list) listCopy.add(constructDateStr(o));
+                field = listCopy;
+              }
+            }
+            m.put(s, field);
+          });
+      if (firstDoc) {
+        firstDoc = false;
+      } else {
+        writer.append(',');
+      }
+      jsonWriter.write(m);
+      writer.write(charArr.getArray(), charArr.getStart(), charArr.getEnd());
+      writer.append('\n');
+      super.accept(doc);
+    }
+
+    private boolean hasdate(List<?> list) {
+      boolean hasDate = false;
+      for (Object o : list) {
+        if (o instanceof Date) {
+          hasDate = true;
+          break;
+        }
+      }
+      return hasDate;
+    }
+
+    private Object constructDateStr(Object field) {
+      if (field instanceof Date) {
+        field =
+            DateTimeFormatter.ISO_INSTANT.format(Instant.ofEpochMilli(((Date) field).getTime()));
+      }
+      return field;
+    }
+  }
+
   static class JavabinSink extends DocsSink {
     JavaBinCodec codec;
 
diff --git a/solr/core/src/test/org/apache/solr/cli/TestExportTool.java b/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
index dd57cde022f..56c6e9b8656 100644
--- a/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
+++ b/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
@@ -31,7 +31,7 @@ import org.apache.lucene.tests.util.TestUtil;
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.client.solrj.SolrClient;
 import org.apache.solr.client.solrj.SolrQuery;
-import org.apache.solr.client.solrj.impl.HttpSolrClient;
+import org.apache.solr.client.solrj.impl.Http2SolrClient;
 import org.apache.solr.client.solrj.request.AbstractUpdateRequest;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.client.solrj.request.JavaBinUpdateRequestCodec;
@@ -81,7 +81,7 @@ public class TestExportTool extends SolrCloudTestCase {
       String url = cluster.getRandomJetty(random()).getBaseUrl() + "/" + COLLECTION_NAME;
 
       ExportTool.Info info = new ExportTool.MultiThreadedRunner(url);
-      String absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".json";
+      String absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".jsonl";
       info.setOutFormat(absolutePath, "jsonl", false);
       info.setLimit("200");
       info.fields = "id,desc_s,a_dt";
@@ -90,7 +90,7 @@ public class TestExportTool extends SolrCloudTestCase {
       assertJsonDocsCount(info, 200, record -> "2019-09-30T05:58:03Z".equals(record.get("a_dt")));
 
       info = new ExportTool.MultiThreadedRunner(url);
-      absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".json";
+      absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".jsonl";
       info.setOutFormat(absolutePath, "jsonl", false);
       info.setLimit("-1");
       info.fields = "id,desc_s";
@@ -115,6 +115,24 @@ public class TestExportTool extends SolrCloudTestCase {
       info.exportDocs();
       assertJavabinDocsCount(info, 1000);
 
+      info = new ExportTool.MultiThreadedRunner(url);
+      absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".json";
+      info.setOutFormat(absolutePath, "json", false);
+      info.setLimit("200");
+      info.fields = "id,desc_s";
+      info.exportDocs();
+
+      assertJsonDocsCount2(info, 200);
+
+      info = new ExportTool.MultiThreadedRunner(url);
+      absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".json";
+      info.setOutFormat(absolutePath, "json", false);
+      info.setLimit("-1");
+      info.fields = "id,desc_s";
+      info.exportDocs();
+
+      assertJsonDocsCount2(info, 1000);
+
     } finally {
       cluster.shutdown();
     }
@@ -161,7 +179,7 @@ public class TestExportTool extends SolrCloudTestCase {
       long totalDocsFromCores = 0;
       for (Slice slice : coll.getSlices()) {
         Replica replica = slice.getLeader();
-        try (SolrClient client = new HttpSolrClient.Builder(replica.getBaseUrl()).build()) {
+        try (SolrClient client = new Http2SolrClient.Builder(replica.getBaseUrl()).build()) {
           long count = ExportTool.getDocCount(replica.getCoreName(), client);
           docCounts.put(replica.getCoreName(), count);
           totalDocsFromCores += count;
@@ -185,7 +203,7 @@ public class TestExportTool extends SolrCloudTestCase {
       }
       info = new ExportTool.MultiThreadedRunner(url);
       info.output = System.out;
-      absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".json";
+      absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".jsonl";
       info.setOutFormat(absolutePath, "jsonl", false);
       info.fields = "id,desc_s";
       info.setLimit("-1");
@@ -217,6 +235,11 @@ public class TestExportTool extends SolrCloudTestCase {
     }
   }
 
+  private void assertJsonDocsCount2(ExportTool.Info info, int expected) {
+    assertTrue(
+        "" + info.docsWritten.get() + " expected " + expected, info.docsWritten.get() >= expected);
+  }
+
   private void assertJsonDocsCount(
       ExportTool.Info info, int expected, Predicate<Map<String, Object>> predicate)
       throws IOException {
diff --git a/solr/packaging/test/test_export.bats b/solr/packaging/test/test_export.bats
index 53f5180fcea..0e9aba9aef5 100644
--- a/solr/packaging/test/test_export.bats
+++ b/solr/packaging/test/test_export.bats
@@ -35,11 +35,11 @@ teardown() {
   refute_output --partial 'Unrecognized option'
   assert_output --partial 'Export complete'
 
-  assert [ -e ${BATS_TEST_TMPDIR}/output.jsonl ]
+  assert [ -e ${BATS_TEST_TMPDIR}/output.json ]
 
   run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test"
-  assert [ -e techproducts.jsonl ]
-  rm techproducts.jsonl
+  assert [ -e techproducts.json ]
+  rm techproducts.json
 
   run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -format javabin
   assert [ -e techproducts.javabin ]
@@ -47,10 +47,10 @@ teardown() {
 
   # the old pattern of putting a suffix on the -out value to control the format is no longer supported ;-).
   run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -out "${BATS_TEST_TMPDIR}/output.javabin"
-  assert [ -e ${BATS_TEST_TMPDIR}/output.javabin.jsonl ]
+  assert [ -e ${BATS_TEST_TMPDIR}/output.javabin.json ]
 
   run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -out "${BATS_TEST_TMPDIR}"
-  assert [ -e ${BATS_TEST_TMPDIR}/techproducts.jsonl ]
+  assert [ -e ${BATS_TEST_TMPDIR}/techproducts.json ]
 
   run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -format jsonl -out "${BATS_TEST_TMPDIR}/output"
   assert [ -e ${BATS_TEST_TMPDIR}/output.jsonl ]
@@ -58,12 +58,6 @@ teardown() {
   run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -compress -format jsonl -out "${BATS_TEST_TMPDIR}/output"
   assert [ -e ${BATS_TEST_TMPDIR}/output.jsonl.gz ]
 
-  # Confirm we don't properly support json right now.
-  run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -format json -out "${BATS_TEST_TMPDIR}/output.json"
-  assert_output --partial 'format must be one of:'
-  refute [ -e ${BATS_TEST_TMPDIR}/output.json ]
-
-
 }
 
 @test "export fails on non cloud mode" {
diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc b/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
index 72495320aa6..9d92011663f 100644
--- a/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
+++ b/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
@@ -1381,7 +1381,7 @@ Examples of this command:
 
 === Exporting Documents to a File
 
-The `export` command will allow you to export documents from a collection in either JSON or Javabin format.
+The `export` command will allow you to export documents from a collection in JSON, JSON with Lines, or Javabin format.
 All documents can be exported, or only those that match a query.
 
 NOTE: This hasn't been tested with nested child documents and your results will vary.
@@ -1409,12 +1409,12 @@ Fully-qualified address to a collection.
 +
 [%autowidth,frame=none]
 |===
-|Optional |Default: `jsonl`
+|Optional |Default: `json`
 |===
 +
-The file format of the export, `jsonl` or `javabin`.
-Choosing `javabin` exports in the native Solr format.
-This is compact and faster to import.
+The file format of the export, `json`, `jsonl`, or `javabin`.
+Choosing `javabin` exports in the native Solr format, and is compact and fast to import.
+`jsonl` is the JSON with Lines format; learn more at https://jsonlines.org/.
 
 `-out`::
 +
@@ -1425,7 +1425,7 @@ This is compact and faster to import.
 +
 Either the path to a directory for the export to happen in, or a specific file name to populate.
 +
-If not provided, a file will be created with the name of the collection, as in `<collection>.jsonl`.
+If not provided, a file will be created with the name of the collection, as in `<collection>.json`.
 
 `-compress`::
 +
@@ -1434,7 +1434,7 @@ If not provided, a file will be created with the name of the collection, as in `
 |Optional |Default: false
 |===
 +
-If you specify `-compress` then the resulting outputting file with will be gzipped, for example `<collection>.jsonl.gz`.
+If you specify `-compress` then the resulting output file will be gzipped, for example `<collection>.json.gz`.
 
 `-query`::
 +
@@ -1473,24 +1473,52 @@ Export all documents from a collection `gettingstarted`:
 [source,bash]
bin/solr export -url http://localhost:8983/solr/gettingstarted -limit -1
 
-Export all documents of collection `gettingstarted` into a file called `1MDocs.json.gz` as a compressed JSONL file:
+Export all documents of collection `gettingstarted` into a file called `1MDocs.json.gz` as a compressed JSON file:
 
 [source,bash]
 ----
-bin/solr export -url http://localhost:8983/solr/gettingstarted -1 -format jsonl -compress -out 1MDocs
+bin/solr export -url http://localhost:8983/solr/gettingstarted -limit -1 -format json -compress -out 1MDocs
 ----
 
 === Importing Documents into a Collection
 
 Once you have exported documents in a file, you can use the xref:indexing-guide:indexing-with-update-handlers.adoc[/update request handler] to import them to a new Solr collection.
 
-*Example: import `jsonl` files*
+*Example: import `json` files*
 
-`curl -X POST -d @gettingstarted.json http://localhost:8983/solr/gettingstarted/update/json/docs?commit=true`
+First export the documents, specifying the fields you want to export so that any fields populated via a `copyField` are excluded:
+
+[,console]
+----
+$ bin/solr export -url http://localhost:8983/solr/gettingstarted -fields id,name,manu,cat,features
+----
+
+Create a new collection to import the exported documents into:
+
+[,console]
+----
+$ bin/solr create_collection -c test_collection -n techproducts
+----
+
+Now import the data with either of these methods:
+
+[,console]
+----
+$ curl -X POST -d @gettingstarted.json 'http://localhost:8983/solr/test_collection/update/json/docs?commit=true'
+----
+or
+[,console]
+----
+$ curl -H 'Content-Type: application/json' -X POST -d @gettingstarted.json 'http://localhost:8983/solr/test_collection/update?commit=true'
+----
 
 *Example: import `javabin` files*
 
-`curl -X POST --header "Content-Type: application/javabin" --data-binary @gettingstarted.javabin http://localhost:8983/solr/gettingstarted/update?commit=true`
+[,console]
+----
+$ bin/solr export -url http://localhost:8983/solr/gettingstarted -format javabin -fields id,name,manu,cat,features
+$ curl -X POST --header "Content-Type: application/javabin" --data-binary @gettingstarted.javabin 'http://localhost:8983/solr/test_collection/update?commit=true'
+----
 
 == Interacting with API