You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by ep...@apache.org on 2023/05/09 11:05:08 UTC
[solr] branch main updated: SOLR-16782: Export tool should export in JSON that matches Solr Import Format (#1623)
This is an automated email from the ASF dual-hosted git repository.
epugh pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/main by this push:
new ad4875d9bb2 SOLR-16782: Export tool should export in JSON that matches Solr Import Format (#1623)
ad4875d9bb2 is described below
commit ad4875d9bb204c0e98b5178bca804cba897929c2
Author: Eric Pugh <ep...@opensourceconnections.com>
AuthorDate: Tue May 9 07:05:01 2023 -0400
SOLR-16782: Export tool should export in JSON that matches Solr Import Format (#1623)
Fixed bug that JSON output format was actually a JSON with Lines output format by introducing jsonl output parameter. Add proper JSON output format.
---
solr/CHANGES.txt | 2 +
.../src/java/org/apache/solr/cli/ExportTool.java | 109 +++++++++++++++++++--
.../test/org/apache/solr/cli/TestExportTool.java | 33 ++++++-
solr/packaging/test/test_export.bats | 16 +--
.../pages/solr-control-script-reference.adoc | 52 +++++++---
5 files changed, 178 insertions(+), 34 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 38276303ae7..85fb2c52a4a 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -113,6 +113,8 @@ Improvements
* SOLR-16394: The v2 "restore" API has been tweaked to be more intuitive. The top-level "restore-collection" command
specifier has been removed, and the API now lives at the new path `POST /api/backups/backupName/restore`. (Jason Gerlowski)
+* SOLR-16782: bin/solr export tool now supports JSON export format as well as the existing JSON With Lines format. (Eric Pugh)
+
Optimizations
---------------------
diff --git a/solr/core/src/java/org/apache/solr/cli/ExportTool.java b/solr/core/src/java/org/apache/solr/cli/ExportTool.java
index 71c3ae1da80..a262018206a 100644
--- a/solr/core/src/java/org/apache/solr/cli/ExportTool.java
+++ b/solr/core/src/java/org/apache/solr/cli/ExportTool.java
@@ -19,6 +19,7 @@ package org.apache.solr.cli;
import static org.apache.solr.common.params.CommonParams.FL;
import static org.apache.solr.common.params.CommonParams.JAVABIN;
+import static org.apache.solr.common.params.CommonParams.JSON;
import static org.apache.solr.common.params.CommonParams.Q;
import static org.apache.solr.common.params.CommonParams.SORT;
import static org.apache.solr.common.util.JavaBinCodec.SOLRINPUTDOC;
@@ -133,7 +134,7 @@ public class ExportTool extends ToolBase {
this.format = format;
this.out = out;
if (this.format == null) {
- this.format = "jsonl";
+ this.format = JSON;
}
if (!formats.contains(this.format)) {
throw new IllegalArgumentException("format must be one of: " + formats);
@@ -143,14 +144,26 @@ public class ExportTool extends ToolBase {
} else if (Files.isDirectory(Path.of(this.out))) {
this.out = this.out + "/" + coll;
}
- this.out = JAVABIN.equals(this.format) ? this.out + ".javabin" : this.out + ".jsonl";
+ this.out = this.out + '.' + this.format;
if (compress) {
this.out = this.out + ".gz";
}
}
DocsSink getSink() {
- return JAVABIN.equals(format) ? new JavabinSink(this) : new JsonWithLinesSink(this);
+ DocsSink docSink = null;
+ switch (format) {
+ case JAVABIN:
+ docSink = new JavabinSink(this);
+ break;
+ case JSON:
+ docSink = new JsonSink(this);
+ break;
+ case "jsonl":
+ docSink = new JsonWithLinesSink(this);
+ break;
+ }
+ return docSink;
}
abstract void exportDocs() throws Exception;
@@ -183,7 +196,7 @@ public class ExportTool extends ToolBase {
}
}
- static Set<String> formats = Set.of(JAVABIN, "jsonl");
+ static Set<String> formats = Set.of(JAVABIN, JSON, "jsonl");
@Override
public void runImpl(CommandLine cli) throws Exception {
@@ -232,8 +245,7 @@ public class ExportTool extends ToolBase {
Option.builder("format")
.hasArg()
.required(false)
- .desc(
- "Output format for exported docs (jsonl or javabin), defaulting to jsonl, appended to the output file.")
+ .desc("Output format for exported docs (json, jsonl or javabin), defaulting to json.")
.build(),
Option.builder("compress").required(false).desc("Compress the output.").build(),
Option.builder("limit")
@@ -329,6 +341,91 @@ public class ExportTool extends ToolBase {
}
}
+ static class JsonSink extends DocsSink {
+ private final CharArr charArr = new CharArr(1024 * 2);
+ JSONWriter jsonWriter = new JSONWriter(charArr, -1);
+ private Writer writer;
+ private boolean firstDoc = true;
+
+ public JsonSink(Info info) {
+ this.info = info;
+ }
+
+ @Override
+ public void start() throws IOException {
+ fos = new FileOutputStream(info.out);
+ if (info.compress) {
+ fos = new GZIPOutputStream(fos);
+ }
+ if (info.bufferSize > 0) {
+ fos = new BufferedOutputStream(fos, info.bufferSize);
+ }
+ writer = new OutputStreamWriter(fos, StandardCharsets.UTF_8);
+ writer.append('[');
+ }
+
+ @Override
+ public void end() throws IOException {
+ writer.append(']');
+ writer.flush();
+ fos.flush();
+ fos.close();
+ }
+
+ @Override
+ public synchronized void accept(SolrDocument doc) throws IOException {
+ charArr.reset();
+ Map<String, Object> m = CollectionUtil.newLinkedHashMap(doc.size());
+ doc.forEach(
+ (s, field) -> {
+ if (s.equals("_version_") || s.equals("_root_")) return;
+ if (field instanceof List) {
+ if (((List<?>) field).size() == 1) {
+ field = ((List<?>) field).get(0);
+ }
+ }
+ field = constructDateStr(field);
+ if (field instanceof List) {
+ List<?> list = (List<?>) field;
+ if (hasdate(list)) {
+ ArrayList<Object> listCopy = new ArrayList<>(list.size());
+ for (Object o : list) listCopy.add(constructDateStr(o));
+ field = listCopy;
+ }
+ }
+ m.put(s, field);
+ });
+ if (firstDoc) {
+ firstDoc = false;
+ } else {
+ writer.append(',');
+ }
+ jsonWriter.write(m);
+ writer.write(charArr.getArray(), charArr.getStart(), charArr.getEnd());
+ writer.append('\n');
+ super.accept(doc);
+ }
+
+ private boolean hasdate(List<?> list) {
+ boolean hasDate = false;
+ for (Object o : list) {
+ if (o instanceof Date) {
+ hasDate = true;
+ break;
+ }
+ }
+ return hasDate;
+ }
+
+ private Object constructDateStr(Object field) {
+ if (field instanceof Date) {
+ field =
+ DateTimeFormatter.ISO_INSTANT.format(Instant.ofEpochMilli(((Date) field).getTime()));
+ }
+ return field;
+ }
+ }
+
static class JavabinSink extends DocsSink {
JavaBinCodec codec;
diff --git a/solr/core/src/test/org/apache/solr/cli/TestExportTool.java b/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
index dd57cde022f..56c6e9b8656 100644
--- a/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
+++ b/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
@@ -31,7 +31,7 @@ import org.apache.lucene.tests.util.TestUtil;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
-import org.apache.solr.client.solrj.impl.HttpSolrClient;
+import org.apache.solr.client.solrj.impl.Http2SolrClient;
import org.apache.solr.client.solrj.request.AbstractUpdateRequest;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.JavaBinUpdateRequestCodec;
@@ -81,7 +81,7 @@ public class TestExportTool extends SolrCloudTestCase {
String url = cluster.getRandomJetty(random()).getBaseUrl() + "/" + COLLECTION_NAME;
ExportTool.Info info = new ExportTool.MultiThreadedRunner(url);
- String absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".json";
+ String absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".jsonl";
info.setOutFormat(absolutePath, "jsonl", false);
info.setLimit("200");
info.fields = "id,desc_s,a_dt";
@@ -90,7 +90,7 @@ public class TestExportTool extends SolrCloudTestCase {
assertJsonDocsCount(info, 200, record -> "2019-09-30T05:58:03Z".equals(record.get("a_dt")));
info = new ExportTool.MultiThreadedRunner(url);
- absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".json";
+ absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".jsonl";
info.setOutFormat(absolutePath, "jsonl", false);
info.setLimit("-1");
info.fields = "id,desc_s";
@@ -115,6 +115,24 @@ public class TestExportTool extends SolrCloudTestCase {
info.exportDocs();
assertJavabinDocsCount(info, 1000);
+ info = new ExportTool.MultiThreadedRunner(url);
+ absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".json";
+ info.setOutFormat(absolutePath, "json", false);
+ info.setLimit("200");
+ info.fields = "id,desc_s";
+ info.exportDocs();
+
+ assertJsonDocsCount2(info, 200);
+
+ info = new ExportTool.MultiThreadedRunner(url);
+ absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".json";
+ info.setOutFormat(absolutePath, "json", false);
+ info.setLimit("-1");
+ info.fields = "id,desc_s";
+ info.exportDocs();
+
+ assertJsonDocsCount2(info, 1000);
+
} finally {
cluster.shutdown();
}
@@ -161,7 +179,7 @@ public class TestExportTool extends SolrCloudTestCase {
long totalDocsFromCores = 0;
for (Slice slice : coll.getSlices()) {
Replica replica = slice.getLeader();
- try (SolrClient client = new HttpSolrClient.Builder(replica.getBaseUrl()).build()) {
+ try (SolrClient client = new Http2SolrClient.Builder(replica.getBaseUrl()).build()) {
long count = ExportTool.getDocCount(replica.getCoreName(), client);
docCounts.put(replica.getCoreName(), count);
totalDocsFromCores += count;
@@ -185,7 +203,7 @@ public class TestExportTool extends SolrCloudTestCase {
}
info = new ExportTool.MultiThreadedRunner(url);
info.output = System.out;
- absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".json";
+ absolutePath = tmpFileLoc + COLLECTION_NAME + random().nextInt(100000) + ".jsonl";
info.setOutFormat(absolutePath, "jsonl", false);
info.fields = "id,desc_s";
info.setLimit("-1");
@@ -217,6 +235,11 @@ public class TestExportTool extends SolrCloudTestCase {
}
}
+ private void assertJsonDocsCount2(ExportTool.Info info, int expected) {
+ assertTrue(
+ "" + info.docsWritten.get() + " expected " + expected, info.docsWritten.get() >= expected);
+ }
+
private void assertJsonDocsCount(
ExportTool.Info info, int expected, Predicate<Map<String, Object>> predicate)
throws IOException {
diff --git a/solr/packaging/test/test_export.bats b/solr/packaging/test/test_export.bats
index 53f5180fcea..0e9aba9aef5 100644
--- a/solr/packaging/test/test_export.bats
+++ b/solr/packaging/test/test_export.bats
@@ -35,11 +35,11 @@ teardown() {
refute_output --partial 'Unrecognized option'
assert_output --partial 'Export complete'
- assert [ -e ${BATS_TEST_TMPDIR}/output.jsonl ]
+ assert [ -e ${BATS_TEST_TMPDIR}/output.json ]
run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test"
- assert [ -e techproducts.jsonl ]
- rm techproducts.jsonl
+ assert [ -e techproducts.json ]
+ rm techproducts.json
run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -format javabin
assert [ -e techproducts.javabin ]
@@ -47,10 +47,10 @@ teardown() {
# old pattern of putting a suffix on the out that controlled the format no longer supported ;-).
run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -out "${BATS_TEST_TMPDIR}/output.javabin"
- assert [ -e ${BATS_TEST_TMPDIR}/output.javabin.jsonl ]
+ assert [ -e ${BATS_TEST_TMPDIR}/output.javabin.json ]
run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -out "${BATS_TEST_TMPDIR}"
- assert [ -e ${BATS_TEST_TMPDIR}/techproducts.jsonl ]
+ assert [ -e ${BATS_TEST_TMPDIR}/techproducts.json ]
run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -format jsonl -out "${BATS_TEST_TMPDIR}/output"
assert [ -e ${BATS_TEST_TMPDIR}/output.jsonl ]
@@ -58,12 +58,6 @@ teardown() {
run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -compress -format jsonl -out "${BATS_TEST_TMPDIR}/output"
assert [ -e ${BATS_TEST_TMPDIR}/output.jsonl.gz ]
- # Confirm we don't properly support json right now.
- run solr export -url "http://localhost:8983/solr/techproducts" -query "*:* -id:test" -format json -out "${BATS_TEST_TMPDIR}/output.json"
- assert_output --partial 'format must be one of:'
- refute [ -e ${BATS_TEST_TMPDIR}/output.json ]
-
-
}
@test "export fails on non cloud mode" {
diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc b/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
index 72495320aa6..9d92011663f 100644
--- a/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
+++ b/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
@@ -1381,7 +1381,7 @@ Examples of this command:
=== Exporting Documents to a File
-The `export` command will allow you to export documents from a collection in either JSON or Javabin format.
+The `export` command will allow you to export documents from a collection in JSON, JSON with Lines, or Javabin format.
All documents can be exported, or only those that match a query.
NOTE: This hasn't been tested with nested child documents and your results will vary.
@@ -1409,12 +1409,12 @@ Fully-qualified address to a collection.
+
[%autowidth,frame=none]
|===
-|Optional |Default: `jsonl`
+|Optional |Default: `json`
|===
+
-The file format of the export, `jsonl` or `javabin`.
-Choosing `javabin` exports in the native Solr format.
-This is compact and faster to import.
+The file format of the export, `json`, `jsonl`, or `javabin`.
+Choosing `javabin` exports in the native Solr format, and is compact and fast to import.
+`jsonl` is the JSON with Lines format; learn more at https://jsonlines.org/.
`-out`::
+
@@ -1425,7 +1425,7 @@ This is compact and faster to import.
+
Either the path to a directory for the export to happen in, or a specific file name to populate.
+
-If not provided, a file will be created with the name of the collection, as in `<collection>.jsonl`.
+If not provided, a file will be created with the name of the collection, as in `<collection>.json`.
`-compress`::
+
@@ -1434,7 +1434,7 @@ If not provided, a file will be created with the name of the collection, as in `
|Optional |Default: false
|===
+
-If you specify `-compress` then the resulting outputting file with will be gzipped, for example `<collection>.jsonl.gz`.
+If you specify `-compress` then the resulting output file will be gzipped, for example `<collection>.json.gz`.
`-query`::
+
@@ -1473,24 +1473,52 @@ Export all documents from a collection `gettingstarted`:
[source,bash]
bin/solr export -url http://localhost:8983/solr/gettingstarted -limit -1
-Export all documents of collection `gettingstarted` into a file called `1MDocs.json.gz` as a compressed JSONL file:
+Export all documents of collection `gettingstarted` into a file called `1MDocs.json.gz` as a compressed JSON file:
[source,bash]
----
-bin/solr export -url http://localhost:8983/solr/gettingstarted -1 -format jsonl -compress -out 1MDocs
+bin/solr export -url http://localhost:8983/solr/gettingstarted -limit -1 -format json -compress -out 1MDocs
----
=== Importing Documents into a Collection
Once you have exported documents in a file, you can use the xref:indexing-guide:indexing-with-update-handlers.adoc[/update request handler] to import them to a new Solr collection.
-*Example: import `jsonl` files*
+*Example: import `json` files*
-`curl -X POST -d @gettingstarted.json http://localhost:8983/solr/gettingstarted/update/json/docs?commit=true`
+First export the documents, making sure to ignore any fields that are populated via a `copyField` by specifying what fields you want to export:
+
+[,console]
+----
+$ bin/solr export -url http://localhost:8983/solr/gettingstarted -fields id,name,manu,cat,features
+----
+
+Create a new collection to import the exported documents into:
+
+[,console]
+----
+$ bin/solr create_collection -c test_collection -n techproducts
+----
+
+Now import the data with either of these methods:
+
+[,console]
+----
+$ curl -X POST -d @gettingstarted.json 'http://localhost:8983/solr/test_collection/update/json/docs?commit=true'
+----
+or
+[,console]
+----
+$ curl -H 'Content-Type: application/json' -X POST -d @gettingstarted.json 'http://localhost:8983/solr/test_collection/update?commit=true'
+----
*Example: import `javabin` files*
-`curl -X POST --header "Content-Type: application/javabin" --data-binary @gettingstarted.javabin http://localhost:8983/solr/gettingstarted/update?commit=true`
+[,console]
+----
+$ bin/solr export -url http://localhost:8983/solr/gettingstarted -format javabin -fields id,name,manu,cat,features
+$ curl -X POST --header "Content-Type: application/javabin" --data-binary @gettingstarted.javabin 'http://localhost:8983/solr/test_collection/update?commit=true'
+----
== Interacting with API