You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by lc...@apache.org on 2017/04/26 17:56:10 UTC

[1/2] beam git commit: [BEAM-2060] Add charset support in XmlSink

Repository: beam
Updated Branches:
  refs/heads/master c5cf90c70 -> 9ff22a4ef


[BEAM-2060] Add charset support in XmlSink


Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/43647471
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/43647471
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/43647471

Branch: refs/heads/master
Commit: 43647471f9f86b168c30958ef83a0947b5d4eb56
Parents: c5cf90c
Author: Jean-Baptiste Onofré <jb...@apache.org>
Authored: Wed Apr 26 16:09:54 2017 +0200
Committer: Luke Cwik <lc...@google.com>
Committed: Wed Apr 26 10:54:58 2017 -0700

----------------------------------------------------------------------
 .../java/org/apache/beam/sdk/io/xml/XmlIO.java  | 27 +++++++++++++++--
 .../org/apache/beam/sdk/io/xml/XmlSink.java     |  2 +-
 .../org/apache/beam/sdk/io/xml/XmlSinkTest.java | 31 +++++++++++++++++---
 3 files changed, 53 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/beam/blob/43647471/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java
index f8a9edc..ce36abe 100644
--- a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java
+++ b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java
@@ -211,10 +211,20 @@ public class XmlIO {
    *  ...
    * </words>
    * }</pre>
+   *
+   * <p>By default the UTF-8 charset is used. This can be overridden, for example:
+   *
+   * <pre>{@code
+   * p.apply(XmlIO.<Type>write()
+   *      .withRecordClass(Type.class)
+   *      .withRootElement(root_element)
+   *      .withCharset(StandardCharsets.ISO_8859_1)
+   *      .toFilenamePrefix(output_filename));
+   * }</pre>
    */
   // CHECKSTYLE.ON: JavadocStyle
   public static <T> Write<T> write() {
-    return new AutoValue_XmlIO_Write.Builder<T>().build();
+    return new AutoValue_XmlIO_Write.Builder<T>().setCharset("UTF-8").build();
   }
 
   /** Implementation of {@link #read}. */
@@ -432,6 +442,9 @@ public class XmlIO {
     @Nullable
     abstract String getRootElement();
 
+    @Nullable
+    abstract String getCharset();
+
     abstract Builder<T> toBuilder();
 
     @AutoValue.Builder
@@ -442,6 +455,8 @@ public class XmlIO {
 
       abstract Builder<T> setRootElement(String rootElement);
 
+      abstract Builder<T> setCharset(String charset);
+
       abstract Write<T> build();
     }
 
@@ -469,11 +484,17 @@ public class XmlIO {
       return toBuilder().setRootElement(rootElement).build();
     }
 
+    /** Sets the charset used to write the file. */
+    public Write<T> withCharset(Charset charset) {
+      return toBuilder().setCharset(charset.name()).build();
+    }
+
     @Override
     public void validate(PCollection<T> input) {
       checkNotNull(getRecordClass(), "Missing a class to bind to a JAXB context.");
       checkNotNull(getRootElement(), "Missing a root element name.");
       checkNotNull(getFilenamePrefix(), "Missing a filename to write to.");
+      checkNotNull(getCharset(), "Missing charset");
       try {
         JAXBContext.newInstance(getRecordClass());
       } catch (JAXBException e) {
@@ -498,7 +519,9 @@ public class XmlIO {
           .addIfNotNull(
               DisplayData.item("rootElement", getRootElement()).withLabel("XML Root Element"))
           .addIfNotNull(
-              DisplayData.item("recordClass", getRecordClass()).withLabel("XML Record Class"));
+              DisplayData.item("recordClass", getRecordClass()).withLabel("XML Record Class"))
+          .addIfNotNull(
+              DisplayData.item("charset", getCharset()).withLabel("Charset"));
     }
   }
 }

http://git-wip-us.apache.org/repos/asf/beam/blob/43647471/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSink.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSink.java b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSink.java
index 2e7dba1..a1ebf6c 100644
--- a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSink.java
+++ b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSink.java
@@ -85,7 +85,7 @@ class XmlSink<T> extends FileBasedSink<T> {
       marshaller = context.createMarshaller();
       marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, Boolean.TRUE);
       marshaller.setProperty(Marshaller.JAXB_FRAGMENT, Boolean.TRUE);
-      marshaller.setProperty(Marshaller.JAXB_ENCODING, "UTF-8");
+      marshaller.setProperty(Marshaller.JAXB_ENCODING, getSink().spec.getCharset());
       return new XmlWriter<>(this, marshaller);
     }
 

http://git-wip-us.apache.org/repos/asf/beam/blob/43647471/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSinkTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSinkTest.java b/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSinkTest.java
index a6e1b87..bf15cfe 100644
--- a/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSinkTest.java
+++ b/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSinkTest.java
@@ -26,9 +26,11 @@ import static org.junit.Assert.assertNotNull;
 import com.google.common.collect.Lists;
 import java.io.BufferedReader;
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.FileOutputStream;
-import java.io.FileReader;
+import java.io.InputStreamReader;
 import java.nio.channels.WritableByteChannel;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -82,7 +84,26 @@ public class XmlSinkTest {
     List<String> lines = Arrays.asList("<birds>", "<bird>", "<species>robin</species>",
         "<adjective>bemused</adjective>", "</bird>", "<bird>", "<species>goose</species>",
         "<adjective>evasive</adjective>", "</bird>", "</birds>");
-    runTestWrite(writer, bundle, lines);
+    runTestWrite(writer, bundle, lines, StandardCharsets.UTF_8.name());
+  }
+
+  @Test
+  public void testXmlWriterCharset() throws Exception {
+    PipelineOptions options = PipelineOptionsFactory.create();
+    XmlWriteOperation<Bird> writeOp =
+        XmlIO.<Bird>write()
+            .toFilenamePrefix(testFilePrefix)
+            .withRecordClass(Bird.class)
+            .withRootElement("birds")
+            .withCharset(StandardCharsets.ISO_8859_1)
+            .createSink()
+            .createWriteOperation(options);
+    XmlWriter<Bird> writer = writeOp.createWriter(options);
+
+    List<Bird> bundle = Lists.newArrayList(new Bird("bréche", "pinçon"));
+    List<String> lines = Arrays.asList("<birds>", "<bird>", "<species>pinçon</species>",
+        "<adjective>bréche</adjective>", "</bird>", "</birds>");
+    runTestWrite(writer, bundle, lines, StandardCharsets.ISO_8859_1.name());
   }
 
   /**
@@ -181,14 +202,16 @@ public class XmlSinkTest {
   /**
    * Write a bundle with an XmlWriter and verify the output is expected.
    */
-  private <T> void runTestWrite(XmlWriter<T> writer, List<T> bundle, List<String> expected)
+  private <T> void runTestWrite(XmlWriter<T> writer, List<T> bundle, List<String> expected,
+                                String charset)
       throws Exception {
     File tmpFile = tmpFolder.newFile("foo.txt");
     try (FileOutputStream fileOutputStream = new FileOutputStream(tmpFile)) {
       writeBundle(writer, bundle, fileOutputStream.getChannel());
     }
     List<String> lines = new ArrayList<>();
-    try (BufferedReader reader = new BufferedReader(new FileReader(tmpFile))) {
+    try (BufferedReader reader = new BufferedReader(
+        new InputStreamReader(new FileInputStream(tmpFile), charset))) {
       for (;;) {
         String line = reader.readLine();
         if (line == null) {


[2/2] beam git commit: [BEAM-2060] Add charset support in XmlSink

Posted by lc...@apache.org.
[BEAM-2060] Add charset support in XmlSink

This closes #2702


Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/9ff22a4e
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/9ff22a4e
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/9ff22a4e

Branch: refs/heads/master
Commit: 9ff22a4efcfdd7879ea6677d39c3a7f77aa3a232
Parents: c5cf90c 4364747
Author: Luke Cwik <lc...@google.com>
Authored: Wed Apr 26 10:56:01 2017 -0700
Committer: Luke Cwik <lc...@google.com>
Committed: Wed Apr 26 10:56:01 2017 -0700

----------------------------------------------------------------------
 .../java/org/apache/beam/sdk/io/xml/XmlIO.java  | 27 +++++++++++++++--
 .../org/apache/beam/sdk/io/xml/XmlSink.java     |  2 +-
 .../org/apache/beam/sdk/io/xml/XmlSinkTest.java | 31 +++++++++++++++++---
 3 files changed, 53 insertions(+), 7 deletions(-)
----------------------------------------------------------------------