You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by da...@apache.org on 2009/05/05 04:12:20 UTC

svn commit: r771522 - in /hadoop/pig/trunk: ./ src/org/apache/pig/data/ src/org/apache/pig/impl/logicalLayer/parser/ src/org/apache/pig/tools/parameters/ src/org/apache/pig/tools/pigscript/parser/ test/org/apache/pig/test/

Author: daijy
Date: Tue May  5 02:12:19 2009
New Revision: 771522

URL: http://svn.apache.org/viewvc?rev=771522&view=rev
Log:
PIG-774: Pig does not handle Chinese characters correctly

Added:
    hadoop/pig/trunk/test/org/apache/pig/test/TestUTF8.java
Modified:
    hadoop/pig/trunk/CHANGES.txt
    hadoop/pig/trunk/build.xml
    hadoop/pig/trunk/src/org/apache/pig/data/DataByteArray.java
    hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/parser/QueryParser.jjt
    hadoop/pig/trunk/src/org/apache/pig/tools/parameters/ParamLoader.jj
    hadoop/pig/trunk/src/org/apache/pig/tools/parameters/PigFileParser.jj
    hadoop/pig/trunk/src/org/apache/pig/tools/pigscript/parser/PigScriptParser.jj

Modified: hadoop/pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=771522&r1=771521&r2=771522&view=diff
==============================================================================
--- hadoop/pig/trunk/CHANGES.txt (original)
+++ hadoop/pig/trunk/CHANGES.txt Tue May  5 02:12:19 2009
@@ -58,6 +58,8 @@
 PIG-789: Fix dump and illustrate to work with new multi-query feature 
 (hagleitn via gates).
 
+PIG-774: Pig does not handle Chinese characters (in both the parameter substitution
+using -param_file or embedded in the Pig script) correctly (daijy)
 
 Release 0.2.0
 

Modified: hadoop/pig/trunk/build.xml
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/build.xml?rev=771522&r1=771521&r2=771522&view=diff
==============================================================================
--- hadoop/pig/trunk/build.xml (original)
+++ hadoop/pig/trunk/build.xml Tue May  5 02:12:19 2009
@@ -44,7 +44,8 @@
     <property name="build.docs" value="${build.dir}/docs" />
     <property name="build.javadoc" value="${build.docs}/api" />
     <property name="dist.dir" value="${build.dir}/${final.name}" />
-    <property name="build.encoding" value="ISO-8859-1" />
+    <!-- property name="build.encoding" value="ISO-8859-1" / -->
+    <property name="build.encoding" value="UTF8" />
     <!-- TODO with only one version of hadoop in the lib folder we do not need that anymore -->
     <property name="hadoop.jarfile" value="hadoop18.jar" />
     <property name="hbase.jarfile" value="hbase-0.18.1.jar" />

Modified: hadoop/pig/trunk/src/org/apache/pig/data/DataByteArray.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/data/DataByteArray.java?rev=771522&r1=771521&r2=771522&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/data/DataByteArray.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/data/DataByteArray.java Tue May  5 02:12:19 2009
@@ -19,6 +19,7 @@
 
 import java.io.IOException;
 import java.io.Serializable;
+import java.io.UnsupportedEncodingException;
 import java.lang.StringBuilder;
 import java.util.ArrayList;
 import java.util.Collection;
@@ -96,7 +97,12 @@
      * @param s String to make a byte array out of.
      */
     public DataByteArray(String s) {
-        mData = s.getBytes();
+        try {
+            mData = s.getBytes("UTF8");
+        } catch (UnsupportedEncodingException e) {
+            // Every JVM must support UTF-8, so fail loudly rather than leave mData unassigned.
+            throw new RuntimeException("UTF-8 encoding is not supported", e);
+        }
     }
 
     /**
@@ -137,7 +143,13 @@
 
     @Override
     public String toString() {
-        return new String(mData);
+        if (mData == null) { return null; } // same result the catch-all gave for an unset array
+        try {
+            return new String(mData, "UTF8");
+        } catch (UnsupportedEncodingException e) {
+            // UTF-8 is mandatory on every JVM; do not hide the failure behind a null.
+            throw new RuntimeException("UTF-8 encoding is not supported", e);
+        }
     }
 
     /**

Modified: hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/parser/QueryParser.jjt
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/parser/QueryParser.jjt?rev=771522&r1=771521&r2=771522&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/parser/QueryParser.jjt (original)
+++ hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/parser/QueryParser.jjt Tue May  5 02:12:19 2009
@@ -25,6 +25,7 @@
   STATIC = false;
   // Case is ignored in keywords
   IGNORE_CASE = true;
+  JAVA_UNICODE_ESCAPE = true;
 }
 
 PARSER_BEGIN(QueryParser)

Modified: hadoop/pig/trunk/src/org/apache/pig/tools/parameters/ParamLoader.jj
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/tools/parameters/ParamLoader.jj?rev=771522&r1=771521&r2=771522&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/tools/parameters/ParamLoader.jj (original)
+++ hadoop/pig/trunk/src/org/apache/pig/tools/parameters/ParamLoader.jj Tue May  5 02:12:19 2009
@@ -24,6 +24,7 @@
 options {
     // Generate non-static functions
     STATIC = false;
+    JAVA_UNICODE_ESCAPE = true;
 }
 PARSER_BEGIN(ParamLoader)
 package org.apache.pig.tools.parameters;

Modified: hadoop/pig/trunk/src/org/apache/pig/tools/parameters/PigFileParser.jj
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/tools/parameters/PigFileParser.jj?rev=771522&r1=771521&r2=771522&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/tools/parameters/PigFileParser.jj (original)
+++ hadoop/pig/trunk/src/org/apache/pig/tools/parameters/PigFileParser.jj Tue May  5 02:12:19 2009
@@ -25,6 +25,7 @@
     // Generate non-static functions
     STATIC = false;
   	IGNORE_CASE = true;
+    JAVA_UNICODE_ESCAPE = true;
 }
 PARSER_BEGIN(PigFileParser)
 package org.apache.pig.tools.parameters;

Modified: hadoop/pig/trunk/src/org/apache/pig/tools/pigscript/parser/PigScriptParser.jj
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/tools/pigscript/parser/PigScriptParser.jj?rev=771522&r1=771521&r2=771522&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/tools/pigscript/parser/PigScriptParser.jj (original)
+++ hadoop/pig/trunk/src/org/apache/pig/tools/pigscript/parser/PigScriptParser.jj Tue May  5 02:12:19 2009
@@ -23,6 +23,7 @@
   STATIC = false;
   // Case is ignored in keywords
   IGNORE_CASE = true;
+  JAVA_UNICODE_ESCAPE = true;
 }
 
 PARSER_BEGIN(PigScriptParser)

Added: hadoop/pig/trunk/test/org/apache/pig/test/TestUTF8.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/test/org/apache/pig/test/TestUTF8.java?rev=771522&view=auto
==============================================================================
--- hadoop/pig/trunk/test/org/apache/pig/test/TestUTF8.java (added)
+++ hadoop/pig/trunk/test/org/apache/pig/test/TestUTF8.java Tue May  5 02:12:19 2009
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.test;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileReader;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.Iterator;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.pig.ExecType;
+import org.apache.pig.PigServer;
+import org.apache.pig.builtin.PigStorage;
+import org.apache.pig.data.*;
+import org.apache.pig.tools.grunt.Grunt;
+import org.apache.pig.tools.parameters.ParameterSubstitutionPreprocessor;
+
+import junit.framework.TestCase;
+
+/** Checks that UTF-8 text survives PigStorage loading, script/query parsing and parameter substitution. */
+public class TestUTF8 extends TestCase {
+    MiniCluster cluster = MiniCluster.buildCluster();
+    private PigServer pigServer;
+
+    /** Creates a PigServer in MAPREDUCE mode from the cluster's properties before each test. */
+    @Before
+    @Override
+    public void setUp() throws Exception{
+        pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
+    }
+
+    /** Loads a UTF-8 encoded file through PigStorage and expects every line back unchanged. */
+    @Test
+    public void testPigStorage() throws Exception{
+        File f1 = File.createTempFile("tmp", "");
+        PrintWriter pw = new PrintWriter(f1, "UTF-8");
+        pw.println("中文");
+        pw.println("にほんご");
+        pw.println("한국어");
+        pw.println("ภาษาไทย");
+        pw.close();
+
+        pigServer.registerQuery("a = load '" + Util.generateURI(f1.toString()) + "' using " + PigStorage.class.getName() + "();");
+        Iterator<Tuple> iter  = pigServer.openIterator("a");
+        // assertEquals takes (expected, actual); this order keeps failure messages accurate.
+        assertEquals("中文", DataType.toString(iter.next().get(0)));
+        assertEquals("にほんご", DataType.toString(iter.next().get(0)));
+        assertEquals("한국어", DataType.toString(iter.next().get(0)));
+        assertEquals("ภาษาไทย", DataType.toString(iter.next().get(0)));
+        f1.delete();
+    }
+
+    /** Runs a script holding a Chinese comment through Grunt; the test fails only if exec() throws. */
+    @Test
+    public void testScriptParser() throws Throwable {
+        String strCmd = "--中文\n";
+
+        ByteArrayInputStream cmd = new ByteArrayInputStream(strCmd.getBytes("UTF-8"));
+        // Decode with the charset the bytes were encoded in, not the platform default.
+        InputStreamReader reader = new InputStreamReader(cmd, "UTF-8");
+
+        Grunt grunt = new Grunt(new BufferedReader(reader), pigServer.getPigContext());
+        grunt.exec();
+    }
+
+    /** Filters on a Chinese string literal embedded in the query and expects the matching row back. */
+    @Test
+    public void testQueryParser() throws Exception{
+        File f1 = File.createTempFile("tmp", "");
+        PrintWriter pw = new PrintWriter(f1, "UTF-8");
+        pw.println("中文");
+        pw.close();
+
+        pigServer.registerQuery("a = load '" + Util.generateURI(f1.toString()) + "' using " + PigStorage.class.getName() + "();");
+        pigServer.registerQuery("b =  filter a by $0 == '中文';");
+        // Iterate the filtered alias so the Unicode literal is actually matched against the loaded row.
+        Iterator<Tuple> iter  = pigServer.openIterator("b");
+        assertEquals("中文", DataType.toString(iter.next().get(0)));
+        f1.delete();
+    }
+
+    /** Substitutes a Chinese parameter value into a query and expects it in the preprocessed output. */
+    @Test
+    public void testParamSubstitution() throws Exception{
+        File queryFile = File.createTempFile("query", "");
+        PrintWriter ps = new PrintWriter(queryFile);
+        ps.println("b = filter a by $0 == '$querystring';");
+        ps.close();
+
+        String[] arg = {"querystring='中文'"};
+        ParameterSubstitutionPreprocessor psp = new ParameterSubstitutionPreprocessor(50);
+        BufferedReader pigIStream = new BufferedReader(new FileReader(queryFile.toString()));
+        StringWriter pigOStream = new StringWriter();
+
+        psp.genSubstitutedFile(pigIStream , pigOStream , arg, null);
+
+        assertTrue(pigOStream.toString().contains("中文"));
+        queryFile.delete();
+    }
+}