You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by lc...@apache.org on 2017/04/26 17:56:10 UTC
[1/2] beam git commit: [BEAM-2060] Add charset support in XmlSink
Repository: beam
Updated Branches:
refs/heads/master c5cf90c70 -> 9ff22a4ef
[BEAM-2060] Add charset support in XmlSink
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/43647471
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/43647471
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/43647471
Branch: refs/heads/master
Commit: 43647471f9f86b168c30958ef83a0947b5d4eb56
Parents: c5cf90c
Author: Jean-Baptiste Onofré <jb...@apache.org>
Authored: Wed Apr 26 16:09:54 2017 +0200
Committer: Luke Cwik <lc...@google.com>
Committed: Wed Apr 26 10:54:58 2017 -0700
----------------------------------------------------------------------
.../java/org/apache/beam/sdk/io/xml/XmlIO.java | 27 +++++++++++++++--
.../org/apache/beam/sdk/io/xml/XmlSink.java | 2 +-
.../org/apache/beam/sdk/io/xml/XmlSinkTest.java | 31 +++++++++++++++++---
3 files changed, 53 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/43647471/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java
index f8a9edc..ce36abe 100644
--- a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java
+++ b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java
@@ -211,10 +211,20 @@ public class XmlIO {
* ...
* </words>
* }</pre>
+ *
+ * <p>By default the UTF-8 charset is used. This can be overridden, for example:
+ *
+ * <pre>{@code
+ * p.apply(XmlIO.<Type>write()
+ * .withRecordClass(Type.class)
+ * .withRootElement(root_element)
+ * .withCharset(StandardCharsets.ISO_8859_1)
+ * .toFilenamePrefix(output_filename));
+ * }</pre>
*/
// CHECKSTYLE.ON: JavadocStyle
public static <T> Write<T> write() {
- return new AutoValue_XmlIO_Write.Builder<T>().build();
+ return new AutoValue_XmlIO_Write.Builder<T>().setCharset("UTF-8").build();
}
/** Implementation of {@link #read}. */
@@ -432,6 +442,9 @@ public class XmlIO {
@Nullable
abstract String getRootElement();
+ @Nullable
+ abstract String getCharset();
+
abstract Builder<T> toBuilder();
@AutoValue.Builder
@@ -442,6 +455,8 @@ public class XmlIO {
abstract Builder<T> setRootElement(String rootElement);
+ abstract Builder<T> setCharset(String charset);
+
abstract Write<T> build();
}
@@ -469,11 +484,17 @@ public class XmlIO {
return toBuilder().setRootElement(rootElement).build();
}
+ /** Sets the charset used to write the file. */
+ public Write<T> withCharset(Charset charset) {
+ return toBuilder().setCharset(charset.name()).build();
+ }
+
@Override
public void validate(PCollection<T> input) {
checkNotNull(getRecordClass(), "Missing a class to bind to a JAXB context.");
checkNotNull(getRootElement(), "Missing a root element name.");
checkNotNull(getFilenamePrefix(), "Missing a filename to write to.");
+ checkNotNull(getCharset(), "Missing charset");
try {
JAXBContext.newInstance(getRecordClass());
} catch (JAXBException e) {
@@ -498,7 +519,9 @@ public class XmlIO {
.addIfNotNull(
DisplayData.item("rootElement", getRootElement()).withLabel("XML Root Element"))
.addIfNotNull(
- DisplayData.item("recordClass", getRecordClass()).withLabel("XML Record Class"));
+ DisplayData.item("recordClass", getRecordClass()).withLabel("XML Record Class"))
+ .addIfNotNull(
+ DisplayData.item("charset", getCharset()).withLabel("Charset"));
}
}
}
http://git-wip-us.apache.org/repos/asf/beam/blob/43647471/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSink.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSink.java b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSink.java
index 2e7dba1..a1ebf6c 100644
--- a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSink.java
+++ b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSink.java
@@ -85,7 +85,7 @@ class XmlSink<T> extends FileBasedSink<T> {
marshaller = context.createMarshaller();
marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, Boolean.TRUE);
marshaller.setProperty(Marshaller.JAXB_FRAGMENT, Boolean.TRUE);
- marshaller.setProperty(Marshaller.JAXB_ENCODING, "UTF-8");
+ marshaller.setProperty(Marshaller.JAXB_ENCODING, getSink().spec.getCharset());
return new XmlWriter<>(this, marshaller);
}
http://git-wip-us.apache.org/repos/asf/beam/blob/43647471/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSinkTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSinkTest.java b/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSinkTest.java
index a6e1b87..bf15cfe 100644
--- a/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSinkTest.java
+++ b/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSinkTest.java
@@ -26,9 +26,11 @@ import static org.junit.Assert.assertNotNull;
import com.google.common.collect.Lists;
import java.io.BufferedReader;
import java.io.File;
+import java.io.FileInputStream;
import java.io.FileOutputStream;
-import java.io.FileReader;
+import java.io.InputStreamReader;
import java.nio.channels.WritableByteChannel;
+import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
@@ -82,7 +84,26 @@ public class XmlSinkTest {
List<String> lines = Arrays.asList("<birds>", "<bird>", "<species>robin</species>",
"<adjective>bemused</adjective>", "</bird>", "<bird>", "<species>goose</species>",
"<adjective>evasive</adjective>", "</bird>", "</birds>");
- runTestWrite(writer, bundle, lines);
+ runTestWrite(writer, bundle, lines, StandardCharsets.UTF_8.name());
+ }
+
+ @Test
+ public void testXmlWriterCharset() throws Exception {
+ PipelineOptions options = PipelineOptionsFactory.create();
+ XmlWriteOperation<Bird> writeOp =
+ XmlIO.<Bird>write()
+ .toFilenamePrefix(testFilePrefix)
+ .withRecordClass(Bird.class)
+ .withRootElement("birds")
+ .withCharset(StandardCharsets.ISO_8859_1)
+ .createSink()
+ .createWriteOperation(options);
+ XmlWriter<Bird> writer = writeOp.createWriter(options);
+
+ List<Bird> bundle = Lists.newArrayList(new Bird("bréche", "pinçon"));
+ List<String> lines = Arrays.asList("<birds>", "<bird>", "<species>pinçon</species>",
+ "<adjective>bréche</adjective>", "</bird>", "</birds>");
+ runTestWrite(writer, bundle, lines, StandardCharsets.ISO_8859_1.name());
}
/**
@@ -181,14 +202,16 @@ public class XmlSinkTest {
/**
* Write a bundle with an XmlWriter and verify the output is expected.
*/
- private <T> void runTestWrite(XmlWriter<T> writer, List<T> bundle, List<String> expected)
+ private <T> void runTestWrite(XmlWriter<T> writer, List<T> bundle, List<String> expected,
+ String charset)
throws Exception {
File tmpFile = tmpFolder.newFile("foo.txt");
try (FileOutputStream fileOutputStream = new FileOutputStream(tmpFile)) {
writeBundle(writer, bundle, fileOutputStream.getChannel());
}
List<String> lines = new ArrayList<>();
- try (BufferedReader reader = new BufferedReader(new FileReader(tmpFile))) {
+ try (BufferedReader reader = new BufferedReader(
+ new InputStreamReader(new FileInputStream(tmpFile), charset))) {
for (;;) {
String line = reader.readLine();
if (line == null) {
[2/2] beam git commit: [BEAM-2060] Add charset support in XmlSink
Posted by lc...@apache.org.
[BEAM-2060] Add charset support in XmlSink
This closes #2702
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/9ff22a4e
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/9ff22a4e
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/9ff22a4e
Branch: refs/heads/master
Commit: 9ff22a4efcfdd7879ea6677d39c3a7f77aa3a232
Parents: c5cf90c 4364747
Author: Luke Cwik <lc...@google.com>
Authored: Wed Apr 26 10:56:01 2017 -0700
Committer: Luke Cwik <lc...@google.com>
Committed: Wed Apr 26 10:56:01 2017 -0700
----------------------------------------------------------------------
.../java/org/apache/beam/sdk/io/xml/XmlIO.java | 27 +++++++++++++++--
.../org/apache/beam/sdk/io/xml/XmlSink.java | 2 +-
.../org/apache/beam/sdk/io/xml/XmlSinkTest.java | 31 +++++++++++++++++---
3 files changed, 53 insertions(+), 7 deletions(-)
----------------------------------------------------------------------