You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by lc...@apache.org on 2017/04/24 16:21:55 UTC

[1/3] beam git commit: [BEAM-2060] Use withCharset(Charset) for the user facing API

Repository: beam
Updated Branches:
  refs/heads/master 9c396826c -> d6dddee23


[BEAM-2060] Use withCharset(Charset) for the user facing API


Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/7c7ece30
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/7c7ece30
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/7c7ece30

Branch: refs/heads/master
Commit: 7c7ece300055667a076fee99defd62d1fd749232
Parents: ffc7781
Author: Jean-Baptiste Onofré <jb...@apache.org>
Authored: Mon Apr 24 17:57:03 2017 +0200
Committer: Luke Cwik <lc...@google.com>
Committed: Mon Apr 24 09:21:20 2017 -0700

----------------------------------------------------------------------
 .../java/org/apache/beam/sdk/io/xml/XmlIO.java  | 21 ++++++-----------
 .../apache/beam/sdk/io/xml/XmlSourceTest.java   | 24 +-------------------
 2 files changed, 8 insertions(+), 37 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/beam/blob/7c7ece30/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java
index ef07925..f8a9edc 100644
--- a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java
+++ b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java
@@ -22,6 +22,8 @@ import static com.google.common.base.Preconditions.checkNotNull;
 import com.google.auto.value.AutoValue;
 import com.google.common.annotations.VisibleForTesting;
 
+import java.nio.charset.Charset;
+
 import javax.annotation.Nullable;
 import javax.xml.bind.JAXBContext;
 import javax.xml.bind.JAXBException;
@@ -79,7 +81,7 @@ public class XmlIO {
    * }</pre>
    *
    * <p>By default, UTF-8 charset is used. If your file is using a different charset, you have to
-   * specify as follow:
+   * specify the following:
    *
    * <pre>{@code
    * PCollection<String> output = p.apply(XmlIO.<Record>read()
@@ -87,19 +89,10 @@ public class XmlIO {
    *      .withRooElement("root")
    *      .withRecordElement("record")
    *      .withRecordClass(Record.class)
-   *      .withCharset("ISO-8859-1"));
+   *      .withCharset(StandardCharsets.ISO_8859_1));
    * }</pre>
    *
-   * <p>Or:
-   *
-   * <pre>{@code
-   * PCollection<String> output = p.apply(XmlIO.<Record>read()
-   *      .from(file.toPath().toString())
-   *      .withRooElement("root")
-   *      .withRecordElement("record")
-   *      .withRecordClass(Record.class)
-   *      .withCharset(StandardCharsets.ISO_8859_1.name()));
-   * }</pre>
+   * <p>{@link java.nio.charset.StandardCharsets} provides static references to common charsets.
    *
    * <p>Currently, only XML files that use single-byte characters are supported. Using a file that
    * contains multi-byte characters may result in data loss or duplication.
@@ -358,8 +351,8 @@ public class XmlIO {
     /**
      * Sets the XML file charset.
      */
-    public Read<T> withCharset(String charset) {
-      return toBuilder().setCharset(charset).build();
+    public Read<T> withCharset(Charset charset) {
+      return toBuilder().setCharset(charset.name()).build();
     }
 
     @Override

http://git-wip-us.apache.org/repos/asf/beam/blob/7c7ece30/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSourceTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSourceTest.java b/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSourceTest.java
index 9321ac3..3deee3e 100644
--- a/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSourceTest.java
+++ b/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSourceTest.java
@@ -611,29 +611,7 @@ public class XmlSourceTest {
                 .withRecordElement("train")
                 .withRecordClass(Train.class)
                 .withMinBundleSize(1024)
-                .withCharset(StandardCharsets.ISO_8859_1.name()));
-
-    List<Train> expectedResults =
-        ImmutableList.of(new Train("Cédric", 7, "blue", "small"));
-
-    PAssert.that(output).containsInAnyOrder(expectedResults);
-    p.run();
-  }
-
-  @Test
-  public void testReadXMLWithCharsetAsString() throws IOException {
-    File file = tempFolder.newFile("trainXMLISO88591");
-    Files.write(file.toPath(), trainXMLWithISO88591.getBytes(StandardCharsets.ISO_8859_1));
-
-    PCollection<Train> output =
-        p.apply("ReadFileData",
-            XmlIO.<Train>read()
-                .from(file.toPath().toString())
-                .withRootElement("trains")
-                .withRecordElement("train")
-                .withRecordClass(Train.class)
-                .withMinBundleSize(1024)
-                .withCharset("ISO-8859-1"));
+                .withCharset(StandardCharsets.ISO_8859_1));
 
     List<Train> expectedResults =
         ImmutableList.of(new Train("Cédric", 7, "blue", "small"));


[3/3] beam git commit: [BEAM-2060] Allow to specify charset in XmlIO

Posted by lc...@apache.org.
[BEAM-2060] Allow to specify charset in XmlIO

This closes #2660


Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/d6dddee2
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/d6dddee2
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/d6dddee2

Branch: refs/heads/master
Commit: d6dddee23d17ff215587f5e7d2b71260ff001196
Parents: 9c39682 7c7ece3
Author: Luke Cwik <lc...@google.com>
Authored: Mon Apr 24 09:21:47 2017 -0700
Committer: Luke Cwik <lc...@google.com>
Committed: Mon Apr 24 09:21:47 2017 -0700

----------------------------------------------------------------------
 .../java/org/apache/beam/sdk/io/xml/XmlIO.java  | 37 +++++++++++++++++++-
 .../org/apache/beam/sdk/io/xml/XmlSource.java   |  8 +++--
 .../apache/beam/sdk/io/xml/XmlSourceTest.java   | 27 ++++++++++++++
 3 files changed, 68 insertions(+), 4 deletions(-)
----------------------------------------------------------------------



[2/3] beam git commit: [BEAM-2060] Allow to specify charset in XmlIO

Posted by lc...@apache.org.
[BEAM-2060] Allow to specify charset in XmlIO


Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/ffc77813
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/ffc77813
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/ffc77813

Branch: refs/heads/master
Commit: ffc77813bb7883d894f65d1a70a88f6c2f56ca89
Parents: 9c39682
Author: Jean-Baptiste Onofré <jb...@apache.org>
Authored: Mon Apr 24 16:37:40 2017 +0200
Committer: Luke Cwik <lc...@google.com>
Committed: Mon Apr 24 09:21:20 2017 -0700

----------------------------------------------------------------------
 .../java/org/apache/beam/sdk/io/xml/XmlIO.java  | 44 +++++++++++++++++-
 .../org/apache/beam/sdk/io/xml/XmlSource.java   |  8 ++--
 .../apache/beam/sdk/io/xml/XmlSourceTest.java   | 49 ++++++++++++++++++++
 3 files changed, 97 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/beam/blob/ffc77813/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java
index bf0e1b5..ef07925 100644
--- a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java
+++ b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java
@@ -21,6 +21,7 @@ import static com.google.common.base.Preconditions.checkNotNull;
 
 import com.google.auto.value.AutoValue;
 import com.google.common.annotations.VisibleForTesting;
+
 import javax.annotation.Nullable;
 import javax.xml.bind.JAXBContext;
 import javax.xml.bind.JAXBException;
@@ -77,6 +78,29 @@ public class XmlIO {
    *     .withRecordClass(Record.class));
    * }</pre>
    *
+   * <p>By default, UTF-8 charset is used. If your file is using a different charset, you have to
+   * specify as follow:
+   *
+   * <pre>{@code
+   * PCollection<String> output = p.apply(XmlIO.<Record>read()
+   *      .from(file.toPath().toString())
+   *      .withRooElement("root")
+   *      .withRecordElement("record")
+   *      .withRecordClass(Record.class)
+   *      .withCharset("ISO-8859-1"));
+   * }</pre>
+   *
+   * <p>Or:
+   *
+   * <pre>{@code
+   * PCollection<String> output = p.apply(XmlIO.<Record>read()
+   *      .from(file.toPath().toString())
+   *      .withRooElement("root")
+   *      .withRecordElement("record")
+   *      .withRecordClass(Record.class)
+   *      .withCharset(StandardCharsets.ISO_8859_1.name()));
+   * }</pre>
+   *
    * <p>Currently, only XML files that use single-byte characters are supported. Using a file that
    * contains multi-byte characters may result in data loss or duplication.
    *
@@ -94,6 +118,7 @@ public class XmlIO {
     return new AutoValue_XmlIO_Read.Builder<T>()
         .setMinBundleSize(Read.DEFAULT_MIN_BUNDLE_SIZE)
         .setCompressionType(Read.CompressionType.AUTO)
+        .setCharset("UTF-8")
         .build();
   }
 
@@ -220,6 +245,9 @@ public class XmlIO {
 
     abstract long getMinBundleSize();
 
+    @Nullable
+    abstract String getCharset();
+
     abstract Builder<T> toBuilder();
 
     @AutoValue.Builder
@@ -236,6 +264,8 @@ public class XmlIO {
 
       abstract Builder<T> setCompressionType(CompressionType compressionType);
 
+      abstract Builder<T> setCharset(String charset);
+
       abstract Read<T> build();
     }
 
@@ -325,6 +355,13 @@ public class XmlIO {
       return toBuilder().setCompressionType(compressionType).build();
     }
 
+    /**
+     * Sets the XML file charset.
+     */
+    public Read<T> withCharset(String charset) {
+      return toBuilder().setCharset(charset).build();
+    }
+
     @Override
     public void validate(PBegin input) {
       checkNotNull(
@@ -336,6 +373,9 @@ public class XmlIO {
       checkNotNull(
           getRecordClass(),
           "recordClass is null. Use builder method withRecordClass() to set this.");
+      checkNotNull(
+          getCharset(),
+          "charset is null. Use builder method withCharset() to set this.");
     }
 
     @Override
@@ -351,7 +391,9 @@ public class XmlIO {
           .addIfNotNull(
               DisplayData.item("recordElement", getRecordElement()).withLabel("XML Record Element"))
           .addIfNotNull(
-              DisplayData.item("recordClass", getRecordClass()).withLabel("XML Record Class"));
+              DisplayData.item("recordClass", getRecordClass()).withLabel("XML Record Class"))
+          .addIfNotNull(
+              DisplayData.item("charset", getCharset()).withLabel("Charset"));
     }
 
     @VisibleForTesting

http://git-wip-us.apache.org/repos/asf/beam/blob/ffc77813/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSource.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSource.java b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSource.java
index 876c782..1eb0e06 100644
--- a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSource.java
+++ b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSource.java
@@ -185,9 +185,11 @@ public class XmlSource<T> extends FileBasedSource<T> {
 
       byte[] dummyStartDocumentBytes =
           (String.format(
-                  "<?xml version=\"%s\" encoding=\"UTF-8\" ?><%s>",
+                  "<?xml version=\"%s\" encoding=\""
+                      + getCurrentSource().spec.getCharset()
+                      + "\"?><%s>",
                   XML_VERSION, getCurrentSource().spec.getRootElement()))
-              .getBytes(StandardCharsets.UTF_8);
+              .getBytes(getCurrentSource().spec.getCharset());
       preambleByteBuffer.write(dummyStartDocumentBytes);
       // Gets the byte offset (in the input file) of the first record in ReadableByteChannel. This
       // method returns the offset and stores any bytes that should be used when creating the XML
@@ -339,7 +341,7 @@ public class XmlSource<T> extends FileBasedSource<T> {
         this.parser = xmlInputFactory.createXMLStreamReader(
             new SequenceInputStream(
                 new ByteArrayInputStream(lookAhead), Channels.newInputStream(channel)),
-            "UTF-8");
+            getCurrentSource().spec.getCharset());
 
         // Current offset should be the offset before reading the record element.
         while (true) {

http://git-wip-us.apache.org/repos/asf/beam/blob/ffc77813/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSourceTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSourceTest.java b/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSourceTest.java
index 5b33be3..9321ac3 100644
--- a/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSourceTest.java
+++ b/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSourceTest.java
@@ -158,6 +158,11 @@ public class XmlSourceTest {
       + "</train>"
       + "</trains>";
 
+  String trainXMLWithISO88591 =
+      "<trains>"
+      + "<train size=\"small\"><name>Cédric</name><number>7</number><color>blue</color></train>"
+      + "</trains>";
+
   @XmlRootElement
   static class Train {
     public static final int TRAIN_NUMBER_UNDEFINED = -1;
@@ -594,6 +599,50 @@ public class XmlSourceTest {
   }
 
   @Test
+  public void testReadXMLWithCharset() throws IOException {
+    File file = tempFolder.newFile("trainXMLISO88591");
+    Files.write(file.toPath(), trainXMLWithISO88591.getBytes(StandardCharsets.ISO_8859_1));
+
+    PCollection<Train> output =
+        p.apply("ReadFileData",
+            XmlIO.<Train>read()
+                .from(file.toPath().toString())
+                .withRootElement("trains")
+                .withRecordElement("train")
+                .withRecordClass(Train.class)
+                .withMinBundleSize(1024)
+                .withCharset(StandardCharsets.ISO_8859_1.name()));
+
+    List<Train> expectedResults =
+        ImmutableList.of(new Train("Cédric", 7, "blue", "small"));
+
+    PAssert.that(output).containsInAnyOrder(expectedResults);
+    p.run();
+  }
+
+  @Test
+  public void testReadXMLWithCharsetAsString() throws IOException {
+    File file = tempFolder.newFile("trainXMLISO88591");
+    Files.write(file.toPath(), trainXMLWithISO88591.getBytes(StandardCharsets.ISO_8859_1));
+
+    PCollection<Train> output =
+        p.apply("ReadFileData",
+            XmlIO.<Train>read()
+                .from(file.toPath().toString())
+                .withRootElement("trains")
+                .withRecordElement("train")
+                .withRecordClass(Train.class)
+                .withMinBundleSize(1024)
+                .withCharset("ISO-8859-1"));
+
+    List<Train> expectedResults =
+        ImmutableList.of(new Train("Cédric", 7, "blue", "small"));
+
+    PAssert.that(output).containsInAnyOrder(expectedResults);
+    p.run();
+  }
+
+  @Test
   @Category(NeedsRunner.class)
   public void testReadXMLSmallPipeline() throws IOException {
     File file = tempFolder.newFile("trainXMLSmall");