You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@zeppelin.apache.org by pr...@apache.org on 2021/02/10 02:26:55 UTC
[zeppelin] branch master updated: ZEPPELIN-5231: Livy Interpreter doesn't support Japanese Character - Encoding Issue

This is an automated email from the ASF dual-hosted git repository.

prabhjyotsingh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/zeppelin.git


The following commit(s) were added to refs/heads/master by this push:
     new 0fc1d7b  ZEPPELIN-5231: Livy Interpreter doesn't support Japanese Character - Encoding Issue
0fc1d7b is described below

commit 0fc1d7ba2081ebeed5c45d87c27ca57727f54099
Author: Prabhjyot Singh <pr...@gmail.com>
AuthorDate: Mon Feb 8 09:11:06 2021 +0530

    ZEPPELIN-5231: Livy Interpreter doesn't support Japanese Character - Encoding Issue
    
    ### What is this PR for?
    Livy Interpreter doesn't support Japanese Character - Encoding Issue
    
    ### What type of PR is it?
    [Bug Fix]
    
    ### What is the Jira issue?
    * https://issues.apache.org/jira/browse/ZEPPELIN-5231
    
    ### How should this be tested?
    On a paragraph use below to generate data set:
    ```
    %livy.scala
    spark.sql("CREATE TABLE  test ( id int, name String, destination String)")
    spark.sql("INSERT INTO test VALUES ('1', 'みんく', 'みんく')")
    spark.sql("INSERT INTO test VALUES ('2', 'シュレヤン', 'シュレヤン')")
    spark.sql("INSERT INTO test VALUES ('3', 'サイ', 'サイ')")
    spark.sql("INSERT INTO test VALUES ('4', 'ツェッペリン', 'ツェッペリン')")
    spark.sql("INSERT INTO test VALUES ('5', 'テスト', 'テスト')")
    spark.sql("INSERT INTO test VALUES ('6', 'チャラン', 'チャラン')")
    spark.sql("select * from test").show()
    ```
    
    Then in a separate paragraph use this to verify:
    ```
    %livy.sql
    select * from test
    ```
    
    You would notice that on toggling (true/false) between the newly introduced `tableWithUTFCharacter` the same 2nd paragraph works.
    
    ### Screenshots (if appropriate)
    N/A
    
    ### Questions:
    * Does the licenses files need update?N/A
    * Is there breaking changes for older versions?N/A
    * Does this needs documentation?N/A
    
    Author: Prabhjyot Singh <pr...@gmail.com>
    
    Closes #4041 from prabhjyotsingh/ZEPPELIN-5231 and squashes the following commits:
    
    26b8c03d6 [Prabhjyot Singh] update doc
    0dd2ac837 [Prabhjyot Singh] fix indent
    25ede1b3d [Prabhjyot Singh] add unittests
    27e7ea15d [Prabhjyot Singh] ZEPPELIN-5231: add support for UTF-8 characters
    
    Change-Id: If3d62d8fea9a3e73cb1deaeeed8b872e83292035
---
 docs/interpreter/livy.md                           |   5 +
 .../zeppelin/livy/LivySparkSQLInterpreter.java     |  55 ++++++++++-
 livy/src/main/resources/interpreter-setting.json   |   6 ++
 .../zeppelin/livy/LivySQLInterpreterTest.java      | 104 +++++++++++++++++++++
 4 files changed, 167 insertions(+), 3 deletions(-)

diff --git a/docs/interpreter/livy.md b/docs/interpreter/livy.md
index c7a96ba..a7b13a1 100644
--- a/docs/interpreter/livy.md
+++ b/docs/interpreter/livy.md
@@ -175,6 +175,11 @@ Example: `spark.driver.memory` to `livy.spark.driver.memory`
     <td>key_1: value_1; key_2: value_2</td>
     <td>custom http headers when calling livy rest api. Each http header is separated by `;`, and each header is one key value pair where key value is separated by `:`</td>
   </tr>
+  <tr>
+    <td>zeppelin.livy.tableWithUTFCharacters</td>
+    <td>false</td>
+    <td>If database contains UTF characters then set this as true.</td>
+  </tr>
 </table>
 
 **We remove livy.spark.master in zeppelin-0.7. Because we sugguest user to use livy 0.3 in zeppelin-0.7. And livy 0.3 don't allow to specify livy.spark.master, it enfornce yarn-cluster mode.**
diff --git a/livy/src/main/java/org/apache/zeppelin/livy/LivySparkSQLInterpreter.java b/livy/src/main/java/org/apache/zeppelin/livy/LivySparkSQLInterpreter.java
index 4bf2e1e..4faf9c8 100644
--- a/livy/src/main/java/org/apache/zeppelin/livy/LivySparkSQLInterpreter.java
+++ b/livy/src/main/java/org/apache/zeppelin/livy/LivySparkSQLInterpreter.java
@@ -17,6 +17,11 @@
 
 package org.apache.zeppelin.livy;
 
+import com.google.gson.Gson;
+import com.google.gson.reflect.TypeToken;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.zeppelin.interpreter.InterpreterContext;
 import org.apache.zeppelin.interpreter.InterpreterException;
@@ -38,6 +43,7 @@ import static org.apache.commons.lang3.StringEscapeUtils.escapeEcmaScript;
  * Livy SparkSQL Interpreter for Zeppelin.
  */
 public class LivySparkSQLInterpreter extends BaseLivyInterpreter {
+
   public static final String ZEPPELIN_LIVY_SPARK_SQL_FIELD_TRUNCATE =
       "zeppelin.livy.spark.sql.field.truncate";
 
@@ -113,8 +119,17 @@ public class LivySparkSQLInterpreter extends BaseLivyInterpreter {
       // use triple quote so that we don't need to do string escape.
       String sqlQuery = null;
       if (isSpark2) {
-        sqlQuery = "spark.sql(\"\"\"" + line + "\"\"\").show(" + maxResult + ", " +
-            truncate + ")";
+        if (tableWithUTFCharacter()) {
+          sqlQuery = "val df = spark.sql(\"\"\"" + line + "\"\"\")\n"
+              + "for ( col <- df.columns ) {\n"
+              + "    print(col+\"\\t\")\n"
+              + "}\n"
+              + "println\n"
+              + "df.toJSON.take(" + maxResult + ").foreach(println)";
+        } else {
+          sqlQuery = "spark.sql(\"\"\"" + line + "\"\"\").show(" + maxResult + ", " +
+              truncate + ")";
+        }
       } else {
         sqlQuery = "sqlContext.sql(\"\"\"" + line + "\"\"\").show(" + maxResult + ", " +
             truncate + ")";
@@ -127,7 +142,12 @@ public class LivySparkSQLInterpreter extends BaseLivyInterpreter {
           // assumption is correct for now. Ideally livy should return table type. We may do it in
           // the future release of livy.
           if (message.getType() == InterpreterResult.Type.TEXT) {
-            List<String> rows = parseSQLOutput(message.getData());
+            List<String> rows;
+            if (tableWithUTFCharacter()) {
+              rows = parseSQLJsonOutput(message.getData());
+            } else {
+              rows = parseSQLOutput(message.getData());
+            }
             result2.add(InterpreterResult.Type.TABLE, StringUtils.join(rows, "\n"));
             if (rows.size() >= (maxResult + 1)) {
               result2.add(ResultMessages.getExceedsLimitRowsMessage(maxResult,
@@ -153,6 +173,30 @@ public class LivySparkSQLInterpreter extends BaseLivyInterpreter {
     return FormType.SIMPLE;
   }
 
+  protected List<String> parseSQLJsonOutput(String output) {
+    List<String> rows = new ArrayList<>();
+
+    String[] rowsOutput = output.split("(?<!\\\\)\\n");
+    String[] header = rowsOutput[1].split("\t");
+    List<String> cells = new ArrayList<>(Arrays.asList(header));
+    rows.add(StringUtils.join(cells, "\t"));
+
+    for (int i = 2; i < rowsOutput.length; i++) {
+      Map<String, String> retMap = new Gson().fromJson(
+          rowsOutput[i], new TypeToken<HashMap<String, String>>() {
+          }.getType()
+      );
+      cells = new ArrayList<>();
+      for (String s : header) {
+        cells.add(retMap.getOrDefault(s, "null")
+            .replace("\n", "\\n")
+            .replace("\t", "\\t"));
+      }
+      rows.add(StringUtils.join(cells, "\t"));
+    }
+    return rows;
+  }
+
   protected List<String> parseSQLOutput(String output) {
     List<String> rows = new ArrayList<>();
     // Get first line by breaking on \n. We can guarantee
@@ -206,6 +250,7 @@ public class LivySparkSQLInterpreter extends BaseLivyInterpreter {
    * Represent the start and end index of each cell.
    */
   private static class Pair {
+
     private int start;
     private int end;
 
@@ -219,6 +264,10 @@ public class LivySparkSQLInterpreter extends BaseLivyInterpreter {
     return Boolean.parseBoolean(getProperty("zeppelin.livy.concurrentSQL"));
   }
 
+  public boolean tableWithUTFCharacter() {
+    return Boolean.parseBoolean(getProperty("zeppelin.livy.tableWithUTFCharacter"));
+  }
+
   @Override
   public Scheduler getScheduler() {
     if (concurrentSQL()) {
diff --git a/livy/src/main/resources/interpreter-setting.json b/livy/src/main/resources/interpreter-setting.json
index 7aa61d6..f1278df 100644
--- a/livy/src/main/resources/interpreter-setting.json
+++ b/livy/src/main/resources/interpreter-setting.json
@@ -161,6 +161,12 @@
         "defaultValue": false,
         "description": "Execute multiple SQL concurrently if set true.",
         "type": "checkbox"
+      },
+      "zeppelin.livy.tableWithUTFCharacter": {
+        "propertyName": "zeppelin.livy.tableWithUTFCharacter",
+        "defaultValue": false,
+        "description": "If database contains UTF characters then set this as true.",
+        "type": "checkbox"
       }
     },
     "option": {
diff --git a/livy/src/test/java/org/apache/zeppelin/livy/LivySQLInterpreterTest.java b/livy/src/test/java/org/apache/zeppelin/livy/LivySQLInterpreterTest.java
index 8821a86..baa930f 100644
--- a/livy/src/test/java/org/apache/zeppelin/livy/LivySQLInterpreterTest.java
+++ b/livy/src/test/java/org/apache/zeppelin/livy/LivySQLInterpreterTest.java
@@ -171,4 +171,108 @@ public class LivySQLInterpreterTest {
     assertEquals("1\t\\ta", rows.get(1));
     assertEquals("2\t2b", rows.get(2));
   }
+
+  @Test
+  public void parseSQLJsonOutput() {
+    //  Empty sql output
+    //  id name
+    List<String> rows = sqlInterpreter.parseSQLJsonOutput("\nid\tname\n");
+    assertEquals(1, rows.size());
+    assertEquals("id\tname", rows.get(0));
+
+
+    //  sql output with 2 rows
+    //  id name
+    //  {"id":1,"name":"1a"}
+    //  {"id":2,"name":"2a"}
+    rows = sqlInterpreter.parseSQLJsonOutput("\nid\tname\n"
+        + "{\"id\":1,\"name\":\"1a\"}\n"
+        + "{\"id\":2,\"name\":\"2a\"}");
+    assertEquals(3, rows.size());
+    assertEquals("id\tname", rows.get(0));
+    assertEquals("1\t1a", rows.get(1));
+    assertEquals("2\t2a", rows.get(2));
+
+
+    //  sql output with 3 rows and showing "only showing top 3 rows"
+    //  id\tname
+    //  {"id":1,"name":"1a"}
+    //  {"id":2,"name":"2a"}
+    //  {"id":3,"name":"3a"}
+    //  only showing top 3 rows
+    rows = sqlInterpreter.parseSQLJsonOutput("\nid\tname\n"
+        + "{\"id\":1,\"name\":\"1a\"}\n"
+        + "{\"id\":2,\"name\":\"2a\"}\n"
+        + "{\"id\":3,\"name\":\"3a\"}");
+    assertEquals(4, rows.size());
+    assertEquals("id\tname", rows.get(0));
+    assertEquals("1\t1a", rows.get(1));
+    assertEquals("2\t2a", rows.get(2));
+    assertEquals("3\t3a", rows.get(3));
+
+
+    //  sql output with 1 rows and showing "only showing top 1 rows"
+    //  id
+    //  {"id":1}
+    //  {"id":2}
+    //  {"id":3}
+    //  only showing top 1 rows
+    rows = sqlInterpreter.parseSQLJsonOutput("\nid\n"
+        + "{\"id\":1}");
+    assertEquals(2, rows.size());
+    assertEquals("id", rows.get(0));
+    assertEquals("1", rows.get(1));
+
+
+
+    //  sql output with 3 rows, 3 columns, showing "only showing top 3 rows" with a line break in
+    //  the data
+    //  id name destination
+    //  {"id":1,"name":"1a","destination":"1b"}
+    //  {"id":2,"name":"2\na","destination":"2b"}
+    //  {"id":3,"name":"3a","destination":"3b"}
+    //  only showing top 3 rows
+    rows = sqlInterpreter.parseSQLJsonOutput("\nid\tname\tdestination\n"
+        + "{\"id\":1,\"name\":\"1a\",\"destination\":\"1b\"}\n"
+        + "{\"id\":2,\"name\":\"2\\na\",\"destination\":\"2b\"}\n"
+        + "{\"id\":3,\"name\":\"3a\",\"destination\":\"3b\"}");
+    assertEquals(4, rows.size());
+    assertEquals("id\tname\tdestination", rows.get(0));
+    assertEquals("1\t1a\t1b", rows.get(1));
+    assertEquals("2\t2\\na\t2b", rows.get(2));
+    assertEquals("3\t3a\t3b", rows.get(3));
+
+
+    //  sql output with 3 rows and one containing a tab
+    //  id name
+    //  {"id":1,"name":"1a"}
+    //  {"id":2,"name":"2\ta"}
+    //  {"id":3,"name":"3a"}
+    //  only showing top 3 rows
+    rows = sqlInterpreter.parseSQLJsonOutput("\nid\tname\n"
+        + "{\"id\":1,\"name\":\"1a\"}\n"
+        + "{\"id\":2,\"name\":\"2\ta\"}\n"
+        + "{\"id\":3,\"name\":\"3a\"}");
+    assertEquals(4, rows.size());
+    assertEquals("id\tname", rows.get(0));
+    assertEquals("1\t1a", rows.get(1));
+    assertEquals("2\t2\\ta", rows.get(2));
+    assertEquals("3\t3a", rows.get(3));
+
+    //  sql output with 3 rows and one containing a Japanese characters
+    //  id name
+    //  {"id":1,"name":"1a"}
+    //  {"id":2,"name":"みんく"}
+    //  {"id":3,"name":"3a"}
+    //  only showing top 3 rows
+    rows = sqlInterpreter.parseSQLJsonOutput("\nid\tname\n"
+        + "{\"id\":1,\"name\":\"1a\"}\n"
+        + "{\"id\":2,\"name\":\"みんく\"}\n"
+        + "{\"id\":3,\"name\":\"3a\"}");
+    assertEquals(4, rows.size());
+    assertEquals("id\tname", rows.get(0));
+    assertEquals("1\t1a", rows.get(1));
+    assertEquals("2\tみんく", rows.get(2));
+    assertEquals("3\t3a", rows.get(3));
+  }
 }