You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by da...@apache.org on 2009/05/05 04:12:20 UTC
svn commit: r771522 - in /hadoop/pig/trunk: ./ src/org/apache/pig/data/
src/org/apache/pig/impl/logicalLayer/parser/
src/org/apache/pig/tools/parameters/
src/org/apache/pig/tools/pigscript/parser/ test/org/apache/pig/test/
Author: daijy
Date: Tue May 5 02:12:19 2009
New Revision: 771522
URL: http://svn.apache.org/viewvc?rev=771522&view=rev
Log:
PIG-774: Pig does not handle Chinese characters correctly
Added:
hadoop/pig/trunk/test/org/apache/pig/test/TestUTF8.java
Modified:
hadoop/pig/trunk/CHANGES.txt
hadoop/pig/trunk/build.xml
hadoop/pig/trunk/src/org/apache/pig/data/DataByteArray.java
hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/parser/QueryParser.jjt
hadoop/pig/trunk/src/org/apache/pig/tools/parameters/ParamLoader.jj
hadoop/pig/trunk/src/org/apache/pig/tools/parameters/PigFileParser.jj
hadoop/pig/trunk/src/org/apache/pig/tools/pigscript/parser/PigScriptParser.jj
Modified: hadoop/pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=771522&r1=771521&r2=771522&view=diff
==============================================================================
--- hadoop/pig/trunk/CHANGES.txt (original)
+++ hadoop/pig/trunk/CHANGES.txt Tue May 5 02:12:19 2009
@@ -58,6 +58,8 @@
PIG-789: Fix dump and illustrate to work with new multi-query feature
(hagleitn via gates).
+PIG-774: Pig does not handle Chinese characters (in both the parameter substitution
+using -param_file or embedded in the Pig script) correctly (daijy)
Release 0.2.0
Modified: hadoop/pig/trunk/build.xml
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/build.xml?rev=771522&r1=771521&r2=771522&view=diff
==============================================================================
--- hadoop/pig/trunk/build.xml (original)
+++ hadoop/pig/trunk/build.xml Tue May 5 02:12:19 2009
@@ -44,7 +44,8 @@
<property name="build.docs" value="${build.dir}/docs" />
<property name="build.javadoc" value="${build.docs}/api" />
<property name="dist.dir" value="${build.dir}/${final.name}" />
- <property name="build.encoding" value="ISO-8859-1" />
+ <!-- property name="build.encoding" value="ISO-8859-1" / -->
+ <property name="build.encoding" value="UTF8" />
<!-- TODO with only one version of hadoop in the lib folder we do not need that anymore -->
<property name="hadoop.jarfile" value="hadoop18.jar" />
<property name="hbase.jarfile" value="hbase-0.18.1.jar" />
Modified: hadoop/pig/trunk/src/org/apache/pig/data/DataByteArray.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/data/DataByteArray.java?rev=771522&r1=771521&r2=771522&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/data/DataByteArray.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/data/DataByteArray.java Tue May 5 02:12:19 2009
@@ -19,6 +19,7 @@
import java.io.IOException;
import java.io.Serializable;
+import java.io.UnsupportedEncodingException;
import java.lang.StringBuilder;
import java.util.ArrayList;
import java.util.Collection;
@@ -96,7 +97,12 @@
* @param s String to make a byte array out of.
*/
public DataByteArray(String s) {
- mData = s.getBytes();
+ try {
+ mData = s.getBytes("UTF8");
+ } catch (UnsupportedEncodingException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
}
/**
@@ -137,7 +143,13 @@
@Override
public String toString() {
- return new String(mData);
+ String r=null;
+ try {
+ r = new String(mData, "UTF8");
+ } catch (Exception e) {
+ // TODO: handle exception
+ }
+ return r;
}
/**
Modified: hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/parser/QueryParser.jjt
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/parser/QueryParser.jjt?rev=771522&r1=771521&r2=771522&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/parser/QueryParser.jjt (original)
+++ hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/parser/QueryParser.jjt Tue May 5 02:12:19 2009
@@ -25,6 +25,7 @@
STATIC = false;
// Case is ignored in keywords
IGNORE_CASE = true;
+ JAVA_UNICODE_ESCAPE = true;
}
PARSER_BEGIN(QueryParser)
Modified: hadoop/pig/trunk/src/org/apache/pig/tools/parameters/ParamLoader.jj
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/tools/parameters/ParamLoader.jj?rev=771522&r1=771521&r2=771522&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/tools/parameters/ParamLoader.jj (original)
+++ hadoop/pig/trunk/src/org/apache/pig/tools/parameters/ParamLoader.jj Tue May 5 02:12:19 2009
@@ -24,6 +24,7 @@
options {
// Generate non-static functions
STATIC = false;
+ JAVA_UNICODE_ESCAPE = true;
}
PARSER_BEGIN(ParamLoader)
package org.apache.pig.tools.parameters;
Modified: hadoop/pig/trunk/src/org/apache/pig/tools/parameters/PigFileParser.jj
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/tools/parameters/PigFileParser.jj?rev=771522&r1=771521&r2=771522&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/tools/parameters/PigFileParser.jj (original)
+++ hadoop/pig/trunk/src/org/apache/pig/tools/parameters/PigFileParser.jj Tue May 5 02:12:19 2009
@@ -25,6 +25,7 @@
// Generate non-static functions
STATIC = false;
IGNORE_CASE = true;
+ JAVA_UNICODE_ESCAPE = true;
}
PARSER_BEGIN(PigFileParser)
package org.apache.pig.tools.parameters;
Modified: hadoop/pig/trunk/src/org/apache/pig/tools/pigscript/parser/PigScriptParser.jj
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/tools/pigscript/parser/PigScriptParser.jj?rev=771522&r1=771521&r2=771522&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/tools/pigscript/parser/PigScriptParser.jj (original)
+++ hadoop/pig/trunk/src/org/apache/pig/tools/pigscript/parser/PigScriptParser.jj Tue May 5 02:12:19 2009
@@ -23,6 +23,7 @@
STATIC = false;
// Case is ignored in keywords
IGNORE_CASE = true;
+ JAVA_UNICODE_ESCAPE = true;
}
PARSER_BEGIN(PigScriptParser)
Added: hadoop/pig/trunk/test/org/apache/pig/test/TestUTF8.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/test/org/apache/pig/test/TestUTF8.java?rev=771522&view=auto
==============================================================================
--- hadoop/pig/trunk/test/org/apache/pig/test/TestUTF8.java (added)
+++ hadoop/pig/trunk/test/org/apache/pig/test/TestUTF8.java Tue May 5 02:12:19 2009
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.test;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileReader;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.Iterator;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.pig.ExecType;
+import org.apache.pig.PigServer;
+import org.apache.pig.builtin.PigStorage;
+import org.apache.pig.data.*;
+import org.apache.pig.tools.grunt.Grunt;
+import org.apache.pig.tools.parameters.ParameterSubstitutionPreprocessor;
+
+import junit.framework.TestCase;
+
+public class TestUTF8 extends TestCase {
+
+ MiniCluster cluster = MiniCluster.buildCluster();
+ private PigServer pigServer;
+
+ @Before
+ @Override
+ public void setUp() throws Exception{
+ pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
+ }
+
+ @Test
+ public void testPigStorage() throws Exception{
+
+ File f1 = File.createTempFile("tmp", "");
+ PrintWriter pw = new PrintWriter(f1, "UTF-8");
+ pw.println("中文");
+ pw.println("にほんご");
+ pw.println("한국어");
+ pw.println("ภาษาไทย");
+ pw.close();
+
+ pigServer.registerQuery("a = load '" + Util.generateURI(f1.toString()) + "' using " + PigStorage.class.getName() + "();");
+ Iterator<Tuple> iter = pigServer.openIterator("a");
+
+ assertEquals(DataType.toString(iter.next().get(0)), "中文");
+ assertEquals(DataType.toString(iter.next().get(0)), "にほんご");
+ assertEquals(DataType.toString(iter.next().get(0)), "한국어");
+ assertEquals(DataType.toString(iter.next().get(0)), "ภาษาไทย");
+
+ f1.delete();
+ }
+
+ @Test
+ public void testScriptParser() throws Throwable {
+
+ String strCmd = "--中文\n";
+
+ ByteArrayInputStream cmd = new ByteArrayInputStream(strCmd.getBytes("UTF-8"));
+ InputStreamReader reader = new InputStreamReader(cmd);
+
+ Grunt grunt = new Grunt(new BufferedReader(reader), pigServer.getPigContext());
+
+ grunt.exec();
+ }
+
+ @Test
+ public void testQueryParser() throws Exception{
+ File f1 = File.createTempFile("tmp", "");
+ PrintWriter pw = new PrintWriter(f1, "UTF-8");
+ pw.println("中文");
+ pw.close();
+
+ pigServer.registerQuery("a = load '" + Util.generateURI(f1.toString()) + "' using " + PigStorage.class.getName() + "();");
+ pigServer.registerQuery("b = filter a by $0 == '中文';");
+ Iterator<Tuple> iter = pigServer.openIterator("a");
+
+ assertEquals(DataType.toString(iter.next().get(0)), "中文");
+
+ f1.delete();
+ }
+ @Test
+ public void testParamSubstitution() throws Exception{
+ File queryFile = File.createTempFile("query", "");
+ PrintWriter ps = new PrintWriter(queryFile);
+ ps.println("b = filter a by $0 == '$querystring';");
+ ps.close();
+
+ String[] arg = {"querystring='中文'"};
+
+ ParameterSubstitutionPreprocessor psp = new ParameterSubstitutionPreprocessor(50);
+ BufferedReader pigIStream = new BufferedReader(new FileReader(queryFile.toString()));
+ StringWriter pigOStream = new StringWriter();
+
+ psp.genSubstitutedFile(pigIStream , pigOStream , arg, null);
+
+ assertTrue(pigOStream.toString().contains("中文"));
+
+ queryFile.delete();
+ }
+}