Posted to dev@sqoop.apache.org by "Markus Kemper (JIRA)" <ji...@apache.org> on 2017/08/31 17:40:00 UTC

[jira] [Commented] (SQOOP-3014) Sqoop with HCatalog import loses precision for large numbers that do not fit into double

    [ https://issues.apache.org/jira/browse/SQOOP-3014?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16149309#comment-16149309 ] 

Markus Kemper commented on SQOOP-3014:
--------------------------------------

Adding an additional test case (below) with the following comments:
* Import into HDFS appears to work as expected (see the JDBC sketch after this list)
* The HCatalog load phase is not working, as noted by others
** The issue does not appear to be specific to ORC files (the textfile case fails as well)
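
For reference, a minimal JDBC sketch (reusing the connection details from the test case below) showing why the plain-text HDFS import keeps full precision: the Oracle driver already returns DECIMAL columns as exact java.math.BigDecimal values, and the text import writes their string form.

{code}
import java.math.BigDecimal;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class JdbcDecimalCheck {
    public static void main(String[] args) throws Exception {
        try (Connection conn = DriverManager.getConnection(
                 "jdbc:oracle:thin:@sqoop.apache.com:1521/db11g", "sqoop", "sqoop");
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery("select c2, c3 from sqoop_3014")) {
            while (rs.next()) {
                BigDecimal c2 = rs.getBigDecimal(1); // exact decimal from the driver
                String c3 = rs.getString(2);         // the value as originally inserted
                // The text import effectively writes c2.toString(), so the two
                // should agree up to the declared scale of decimal(22,5).
                System.out.println(c2.toPlainString() + " | " + c3);
            }
        }
    }
}
{code}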

*Test Case*
{noformat}
#######################
# STEP 01 - CREATE SQL TABLE AND DATA
#######################

export MYCONN=jdbc:oracle:thin:@sqoop.apache.com:1521/db11g
export MYUSER=sqoop
export MYPSWD=sqoop
sqoop list-tables --connect $MYCONN --username $MYUSER --password $MYPSWD

sqoop eval --connect $MYCONN --username $MYUSER --password $MYPSWD --query "drop table sqoop_3014"
sqoop eval --connect $MYCONN --username $MYUSER --password $MYPSWD --query "create table sqoop_3014 (c1 integer, c2 decimal(22,5), c3 varchar(40))"
sqoop eval --connect $MYCONN --username $MYUSER --password $MYPSWD --query "insert into sqoop_3014 values (1, 454018528782.42006329, '454018528782.42006329')"
sqoop eval --connect $MYCONN --username $MYUSER --password $MYPSWD --query "insert into sqoop_3014 values (2, 87658675864540185.123456789123456789, '87658675864540185.123456789123456789')"
sqoop eval --connect $MYCONN --username $MYUSER --password $MYPSWD --query "select * from sqoop_3014"

----------------------------------------------------------------------
| C1                   | C2                   | C3                   | 
----------------------------------------------------------------------
| 1                    | 454018528782.42006   | 454018528782.42006329 | 
| 2                    | 87658675864540185.12346 | 87658675864540185.123456789123456789 | 
----------------------------------------------------------------------

#######################
# STEP 02 - IMPORT DATA INTO HDFS (--as-textfile)
#######################

sqoop import --connect $MYCONN --username $MYUSER --password $MYPSWD --table SQOOP_3014 --target-dir /user/root/sqoop_3014 --delete-target-dir --num-mappers 1 --as-textfile --verbose
hdfs dfs -cat /user/root/sqoop_3014/part*

Output:
1,454018528782.42006,454018528782.42006329
2,87658675864540185.12346,87658675864540185.123456789123456789

sqoop import --connect $MYCONN --username $MYUSER --password $MYPSWD --query "select * from sqoop_3014 where \$CONDITIONS" --target-dir /user/root/sqoop_3014 --delete-target-dir --num-mappers 1 --as-textfile --verbose
hdfs dfs -cat /user/root/sqoop_3014/part*

Output:
1,454018528782.42006,454018528782.42006329
2,87658675864540185.12346,87658675864540185.123456789123456789

#######################
# STEP 03 - IMPORT DATA INTO HIVE (--hcatalog-storage-stanza "stored as textfile")
#######################

beeline -u jdbc:hive2:// -e "use db1; drop table sqoop_3014_text purge;"
sqoop import --connect $MYCONN --username $MYUSER --password $MYPSWD --query "select * from sqoop_3014 where \$CONDITIONS" --hcatalog-database db1 --hcatalog-table sqoop_3014_text --create-hcatalog-table --hcatalog-storage-stanza "stored as textfile" --num-mappers 1 --verbose
beeline -u jdbc:hive2:// -e "use db1; select * from sqoop_3014_text; show create table sqoop_3014_text;"

Output:
+---------------------+---------------------+---------------------------------------+--+
| sqoop_3014_text.c1  | sqoop_3014_text.c2  |          sqoop_3014_text.c3           |
+---------------------+---------------------+---------------------------------------+--+
| 1                   | 454018528782.42004  | 454018528782.42006329                 |
| 2                   | 87658675864540192   | 87658675864540185.123456789123456789  |
+---------------------+---------------------+---------------------------------------+--+

#######################
# STEP 04 - IMPORT DATA INTO HIVE (--hcatalog-storage-stanza "stored as orc")
#######################

beeline -u jdbc:hive2:// -e "use db1; drop table sqoop_3014_orc purge;"
sqoop import --connect $MYCONN --username $MYUSER --password $MYPSWD --query "select * from sqoop_3014 where \$CONDITIONS" --hcatalog-database db1 --hcatalog-table sqoop_3014_orc --create-hcatalog-table --hcatalog-storage-stanza "stored as orc" --num-mappers 1 --verbose
beeline -u jdbc:hive2:// -e "use db1; select * from sqoop_3014_orc; show create table sqoop_3014_orc;"

Output:
+---------------------+---------------------+---------------------------------------+--+
| sqoop_3014_orc.c1   | sqoop_3014_orc.c2   |           sqoop_3014_orc.c3           |
+---------------------+---------------------+---------------------------------------+--+
| 1                   | 454018528782.42004  | 454018528782.42006329                 |
| 2                   | 87658675864540192   | 87658675864540185.123456789123456789  |
+---------------------+---------------------+---------------------------------------+--+
+----------------------------------------------------+--+
|                   createtab_stmt                   |
+----------------------------------------------------+--+
| CREATE TABLE `sqoop_3014_orc`(                     |
|   `c1` decimal(38,0),                              |
|   `c2` decimal(22,5),                              |
|   `c3` varchar(40))                                |
| ROW FORMAT SERDE                                   |
|   'org.apache.hadoop.hive.ql.io.orc.OrcSerde'      |
| STORED AS INPUTFORMAT                              |
|   'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' |
| OUTPUTFORMAT                                       |
|   'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' |
| LOCATION                                           |
|   'hdfs://nameservice1/data/dbs/db1/sqoop_3014_orc' |
| TBLPROPERTIES (                                    |
|   'transient_lastDdlTime'='1504198953')            |
+----------------------------------------------------+--+
{noformat}

> Sqoop with HCatalog import loses precision for large numbers that do not fit into double
> ------------------------------------------------------------------------------------------
>
>                 Key: SQOOP-3014
>                 URL: https://issues.apache.org/jira/browse/SQOOP-3014
>             Project: Sqoop
>          Issue Type: Bug
>          Components: hive-integration
>    Affects Versions: 1.4.6
>            Reporter: Pavel Benes
>            Assignee: Venkat Ranganathan
>            Priority: Critical
>             Fix For: 1.4.7
>
>         Attachments: oracle-sqoop-error.png
>
>
> When using sqoop with HCatalog to import data over JDBC (I have tried Oracle 11), all numbers that do not fit into a double lose precision or are distorted.
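> As a quick illustration (my sketch, not code from Sqoop): a value "fits into a double" exactly when it survives a round-trip through doubleValue():
> {code}
> import java.math.BigDecimal;
> 
> public class FitsInDouble {
>     // A value fits into a double if converting it to double and back
>     // reproduces the same numeric value.
>     static boolean fitsInDouble(BigDecimal v) {
>         return new BigDecimal(Double.toString(v.doubleValue())).compareTo(v) == 0;
>     }
> 
>     public static void main(String[] args) {
>         System.out.println(fitsInDouble(new BigDecimal("3.5")));                    // true
>         System.out.println(fitsInDouble(new BigDecimal("454018528782.42006329"))); // false
>     }
> }
> {code}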
> Steps to reproduce:
> 1) Create test table in Oracle and fill it with test data
> {code}
> CREATE TABLE TEST_SQOOP_ERROR(ID VARCHAR(10), TYPE_NUMBER DECIMAL(22,5))
>  
> INSERT INTO MMDINGEST.TEST_SQOOP_ERROR(ID, TYPE_NUMBER) VALUES ( 'row-1', 454018528782.42006329)
> INSERT INTO MMDINGEST.TEST_SQOOP_ERROR(ID, TYPE_NUMBER) VALUES ( 'row-2', 87658675864540185.123456789123456789) 
> INSERT INTO MMDINGEST.TEST_SQOOP_ERROR(ID, TYPE_NUMBER) VALUES ( 'row-3', 87658675864540185.12345)
> INSERT INTO MMDINGEST.TEST_SQOOP_ERROR(ID, TYPE_NUMBER) VALUES ( 'row-4', 87658675864540185.123)
> INSERT INTO MMDINGEST.TEST_SQOOP_ERROR(ID, TYPE_NUMBER) VALUES ( 'row-5', 7658675864540185.12345) 
> INSERT INTO MMDINGEST.TEST_SQOOP_ERROR(ID, TYPE_NUMBER) VALUES ( 'row-6', 7658675864540185.123456789) 
> INSERT INTO MMDINGEST.TEST_SQOOP_ERROR(ID, TYPE_NUMBER) VALUES ( 'row-7', 658675864540185.12345)
> INSERT INTO MMDINGEST.TEST_SQOOP_ERROR(ID, TYPE_NUMBER) VALUES ( 'row-8', 58675864540185.12345)
> INSERT INTO MMDINGEST.TEST_SQOOP_ERROR(ID, TYPE_NUMBER) VALUES ( 'row-9', 8675864540185.12345) 
> INSERT INTO MMDINGEST.TEST_SQOOP_ERROR(ID, TYPE_NUMBER) VALUES ( 'row-10', 675864540185.12345) 
> INSERT INTO MMDINGEST.TEST_SQOOP_ERROR(ID, TYPE_NUMBER) VALUES ( 'row-11', 75864540185.12345) 
> INSERT INTO MMDINGEST.TEST_SQOOP_ERROR(ID, TYPE_NUMBER) VALUES ( 'row-12', 35864540185.12345) 
> INSERT INTO MMDINGEST.TEST_SQOOP_ERROR(ID, TYPE_NUMBER) VALUES ( 'row-13', 5864540185.12345) 
> {code}
> 2) Create table in Hive database
> {code}
> CREATE TABLE pbe_test_sqoop_error(id string, type_number decimal(22,5)) STORED AS ORC;
> {code}
> 3) Import data from Oracle to Hive using sqoop
> {code}
> export HADOOP_CLASSPATH=/opt/mantis/jdbc/oracle-11.2/ojdbc6.jar
> sqoop import -connect jdbc:oracle:thin:@//1.1.1.1:1521/XE --username  XXX --password  XXX --hcatalog-database  default --hcatalog-table pbe_test_sqoop_error  --driver oracle.jdbc.OracleDriver  --query 'SELECT id, type_number FROM MMDINGEST.TEST_SQOOP_ERROR WHERE $CONDITIONS' -m 1
> {code}
> 4) Display data from Hive table
> {code}
> hive> select * from pbe_test_sqoop_error;
> OK
> row-1   454018528782.42004
> row-2   87658675864540192
> row-3   87658675864540192
> row-4   87658675864540192
> row-5   7658675864540185
> row-6   7658675864540185
> row-7   658675864540185.125
> row-8   58675864540185.125
> row-9   8675864540185.12305
> row-10  675864540185.12341
> row-11  75864540185.12344
> row-12  35864540185.12345
> row-13  5864540185.12345
> Time taken: 0.455 seconds, Fetched: 13 row(s)
> {code}
> Only the values at lines 1, 12, and 13 are correct. At lines 2-4 even the part of the number before the decimal point is wrong; this is consistent with double's 53-bit significand (about 15-16 significant decimal digits): around 8.7e16 adjacent doubles are 16 apart, so even the integer part cannot be represented exactly. Everything looks correct in Oracle, as can be seen in the attached screenshot.
> The problem seems to be in the Java class https://www.codatlas.com/github.com/apache/sqoop/branch-1.4.6/src/java/org/apache/sqoop/mapreduce/hcat/SqoopHCatImportHelper.java, starting at line 437.
> {code}
>     } else if (hfsType == HCatFieldSchema.Type.DECIMAL) {
>       BigDecimal bd = new BigDecimal(n.doubleValue(),
>         MathContext.DECIMAL128);
>       return HiveDecimal.create(bd);
>     }
> {code}
> All numbers, even those already delivered as a BigDecimal, are squeezed through a double, which leads to the precision loss. The same issue could exist in other places where large numbers are handled.
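> A self-contained demonstration of the squeeze, using a value from the test data above (a sketch; the printed result matches the Hive output):
> {code}
> import java.math.BigDecimal;
> import java.math.MathContext;
> 
> public class DoubleSqueeze {
>     public static void main(String[] args) {
>         // Exact value as the JDBC driver delivers it.
>         BigDecimal exact = new BigDecimal("87658675864540185.12346");
>         // What the DECIMAL branch above effectively does with it:
>         BigDecimal squeezed = new BigDecimal(exact.doubleValue(), MathContext.DECIMAL128);
>         System.out.println(exact);    // 87658675864540185.12346
>         System.out.println(squeezed); // 87658675864540192 (adjacent doubles are 16 apart here)
>     }
> }
> {code}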
> The following code fixes this issue:
> {code}
> } else if (hfsType == HCatFieldSchema.Type.DECIMAL) {
>   BigDecimal bd = val instanceof BigDecimal
>       ? (BigDecimal) val
>       : new BigDecimal(n.doubleValue(), MathContext.DECIMAL128);
>   return HiveDecimal.create(bd);
> }
> {code}
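> A slightly more defensive variant of the same idea (my sketch, not the committed patch): when the value is not already a BigDecimal, convert via the number's decimal string form instead of doubleValue(), so other numeric types also avoid the binary round-trip:
> {code}
> } else if (hfsType == HCatFieldSchema.Type.DECIMAL) {
>   // Use the driver-supplied BigDecimal unchanged when we have one;
>   // otherwise build one from the decimal string rendering, which for
>   // Float/Double is the shortest form that round-trips exactly.
>   BigDecimal bd = val instanceof BigDecimal
>       ? (BigDecimal) val
>       : new BigDecimal(n.toString(), MathContext.DECIMAL128);
>   return HiveDecimal.create(bd);
> }
> {code}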


