Configure EM Express for Oracle 12.2
Check the current XDB HTTP/HTTPS port settings, then set the HTTPS port used by EM Express:
SQL> select dbms_xdb_config.getHttpsPort() from dual;
DBMS_XDB_CONFIG.GETHTTPSPORT()
------------------------------
0
SQL> select dbms_xdb.getHttpPort() from dual;
DBMS_XDB.GETHTTPPORT()
----------------------
0
SQL> exec dbms_xdb_config.sethttpsport(5500);
PL/SQL procedure successfully completed.
SQL> select dbms_xdb_config.getHttpsPort() from dual;
DBMS_XDB_CONFIG.GETHTTPSPORT()
------------------------------
5500
SQL> !lsnrctl status |grep -i http
(DESCRIPTION=(ADDRESS=(PROTOCOL=tcps)(HOST=vmxdb01.dbaglobe.com)(PORT=5500))(Security=(my_wallet_directory=/u01/app/oracle/admin/orcl/xdb_wallet))(Presentation=HTTP)(Session=RAW))
SQL> select dbms_xdb.getHttpPort() from dual;
DBMS_XDB.GETHTTPPORT()
----------------------
0
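With the HTTPS port set to 5500 and registered with the listener, EM Express should be reachable in a browser at the usual /em path. The URL below is an assumption based on the host and port shown above:

https://vmxdb01.dbaglobe.com:5500/em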
Use Sqoop to import into Hive tables (source is a view)
Contents of the Sqoop lib directory (/opt/cloudera/parcels/CDH/lib/sqoop/lib/); note that no MySQL JDBC driver is present yet:
total 0
lrwxrwxrwx 1 root root 35 Nov 9 13:49 ant-contrib-1.0b3.jar -> ../../../jars/ant-contrib-1.0b3.jar
lrwxrwxrwx 1 root root 40 Nov 9 13:49 ant-eclipse-1.0-jvm1.2.jar -> ../../../jars/ant-eclipse-1.0-jvm1.2.jar
lrwxrwxrwx 1 root root 41 Nov 9 13:42 avro-mapred-hadoop2.jar -> ../../../lib/avro/avro-mapred-hadoop2.jar
lrwxrwxrwx 1 root root 26 Nov 9 13:42 avro.jar -> ../../../lib/avro/avro.jar
lrwxrwxrwx 1 root root 35 Nov 9 13:49 commons-codec-1.4.jar -> ../../../jars/commons-codec-1.4.jar
lrwxrwxrwx 1 root root 40 Nov 9 13:49 commons-compress-1.4.1.jar -> ../../../jars/commons-compress-1.4.1.jar
lrwxrwxrwx 1 root root 32 Nov 9 13:49 commons-io-1.4.jar -> ../../../jars/commons-io-1.4.jar
lrwxrwxrwx 1 root root 36 Nov 9 13:49 commons-jexl-2.1.1.jar -> ../../../jars/commons-jexl-2.1.1.jar
lrwxrwxrwx 1 root root 35 Nov 9 13:49 commons-lang3-3.4.jar -> ../../../jars/commons-lang3-3.4.jar
lrwxrwxrwx 1 root root 39 Nov 9 13:49 commons-logging-1.1.3.jar -> ../../../jars/commons-logging-1.1.3.jar
lrwxrwxrwx 1 root root 30 Nov 9 13:49 fastutil-6.3.jar -> ../../../jars/fastutil-6.3.jar
lrwxrwxrwx 1 root root 33 Nov 9 13:49 hsqldb-1.8.0.10.jar -> ../../../jars/hsqldb-1.8.0.10.jar
lrwxrwxrwx 1 root root 43 Nov 9 13:49 jackson-annotations-2.3.1.jar -> ../../../jars/jackson-annotations-2.3.1.jar
lrwxrwxrwx 1 root root 36 Nov 9 13:49 jackson-core-2.3.1.jar -> ../../../jars/jackson-core-2.3.1.jar
lrwxrwxrwx 1 root root 40 Nov 9 13:49 jackson-core-asl-1.8.8.jar -> ../../../jars/jackson-core-asl-1.8.8.jar
lrwxrwxrwx 1 root root 40 Nov 9 13:49 jackson-databind-2.3.1.jar -> ../../../jars/jackson-databind-2.3.1.jar
lrwxrwxrwx 1 root root 42 Nov 9 13:49 jackson-mapper-asl-1.8.8.jar -> ../../../jars/jackson-mapper-asl-1.8.8.jar
lrwxrwxrwx 1 root root 36 Nov 9 13:42 kite-data-core.jar -> ../../../lib/kite/kite-data-core.jar
lrwxrwxrwx 1 root root 36 Nov 9 13:42 kite-data-hive.jar -> ../../../lib/kite/kite-data-hive.jar
lrwxrwxrwx 1 root root 41 Nov 9 13:42 kite-data-mapreduce.jar -> ../../../lib/kite/kite-data-mapreduce.jar
lrwxrwxrwx 1 root root 47 Nov 9 13:42 kite-hadoop-compatibility.jar -> ../../../lib/kite/kite-hadoop-compatibility.jar
lrwxrwxrwx 1 root root 29 Nov 9 13:49 opencsv-2.3.jar -> ../../../jars/opencsv-2.3.jar
lrwxrwxrwx 1 root root 31 Nov 9 13:49 paranamer-2.3.jar -> ../../../jars/paranamer-2.3.jar
lrwxrwxrwx 1 root root 37 Nov 9 13:42 parquet-avro.jar -> ../../../lib/parquet/parquet-avro.jar
lrwxrwxrwx 1 root root 39 Nov 9 13:42 parquet-column.jar -> ../../../lib/parquet/parquet-column.jar
lrwxrwxrwx 1 root root 39 Nov 9 13:42 parquet-common.jar -> ../../../lib/parquet/parquet-common.jar
lrwxrwxrwx 1 root root 41 Nov 9 13:42 parquet-encoding.jar -> ../../../lib/parquet/parquet-encoding.jar
lrwxrwxrwx 1 root root 39 Nov 9 13:42 parquet-format.jar -> ../../../lib/parquet/parquet-format.jar
lrwxrwxrwx 1 root root 39 Nov 9 13:42 parquet-hadoop.jar -> ../../../lib/parquet/parquet-hadoop.jar
lrwxrwxrwx 1 root root 40 Nov 9 13:42 parquet-jackson.jar -> ../../../lib/parquet/parquet-jackson.jar
lrwxrwxrwx 1 root root 33 Nov 9 13:49 slf4j-api-1.7.5.jar -> ../../../jars/slf4j-api-1.7.5.jar
lrwxrwxrwx 1 root root 37 Nov 9 13:49 snappy-java-1.0.4.1.jar -> ../../../jars/snappy-java-1.0.4.1.jar
lrwxrwxrwx 1 root root 24 Nov 9 13:49 xz-1.0.jar -> ../../../jars/xz-1.0.jar
[donghua@cdh-vm test_db-master]$
[donghua@cdh-vm test_db-master]$ sudo ln -s /usr/share/java/mysql-connector-java.jar /opt/cloudera/parcels/CDH/lib/sqoop/lib/
[sudo] password for donghua:
[donghua@cdh-vm test_db-master]$ readlink /opt/cloudera/parcels/CDH/lib/sqoop/lib/mysql-connector-java.jar
/usr/share/java/mysql-connector-java.jar
[donghua@cdh-vm test_db-master]$
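Not part of the capture: the symlink above assumes the MySQL JDBC driver is already present at /usr/share/java/mysql-connector-java.jar. On a RHEL/CentOS host it can usually be installed from the OS repository, for example:

sudo yum install -y mysql-connector-java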
MariaDB [(none)]> create user employee_user identified by 'password';
Query OK, 0 rows affected (0.07 sec)
MariaDB [(none)]> grant all on employees.* to employee_user;
Query OK, 0 rows affected (0.04 sec)
MariaDB [(none)]> show grants for employee_user;
+--------------------------------------------------------------------------------------------------------------+
| Grants for employee_user@% |
+--------------------------------------------------------------------------------------------------------------+
| GRANT USAGE ON *.* TO 'employee_user'@'%' IDENTIFIED BY PASSWORD '*2470C0C06DEE42FD1618BB99005ADCA2EC9D1E19' |
| GRANT ALL PRIVILEGES ON `employees`.* TO 'employee_user'@'%' |
+--------------------------------------------------------------------------------------------------------------+
2 rows in set (0.00 sec)
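A quick way to confirm the new account can reach the source database from the Sqoop host (not in the original capture; sqoop eval runs an ad-hoc query and prints the result):

sqoop eval --connect jdbc:mysql://cdh-vm.dbaglobe.com/employees \
  --username employee_user -P \
  --query "SELECT COUNT(*) FROM current_dept_emp"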
[donghua@cdh-vm test_db-master]$ sqoop list-databases --connect jdbc:mysql://cdh-vm.dbaglobe.com --username employee_user --password password
Warning: /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/bin/../lib/sqoop/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
18/01/26 23:32:56 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.13.1
18/01/26 23:32:56 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
18/01/26 23:32:56 INFO manager.MySQLManager: Preparing to use a MySQL streaming resultset.
information_schema
employees
[donghua@cdh-vm test_db-master]$ sqoop list-tables --connect jdbc:mysql://cdh-vm.dbaglobe.com/employees --username employee_user --password password
Warning: /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/bin/../lib/sqoop/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
18/01/26 23:33:17 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.13.1
18/01/26 23:33:17 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
18/01/26 23:33:17 INFO manager.MySQLManager: Preparing to use a MySQL streaming resultset.
current_dept_emp
departments
dept_emp
dept_emp_latest_date
dept_manager
employees
salaries
titles
[donghua@cdh-vm test_db-master]$ sqoop import --connect jdbc:mysql://cdh-vm.dbaglobe.com/employees --username employee_user --password password --table current_dept_emp
Warning: /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/bin/../lib/sqoop/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
18/01/26 23:37:48 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.13.1
18/01/26 23:37:48 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
18/01/26 23:37:49 INFO manager.MySQLManager: Preparing to use a MySQL streaming resultset.
18/01/26 23:37:49 INFO tool.CodeGenTool: Beginning code generation
18/01/26 23:37:49 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `current_dept_emp` AS t LIMIT 1
18/01/26 23:37:49 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `current_dept_emp` AS t LIMIT 1
18/01/26 23:37:49 INFO orm.CompilationManager: HADOOP_MAPRED_HOME is /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce
Note: /tmp/sqoop-donghua/compile/f0cac41ee0eb9df573aa4341b36a671d/current_dept_emp.java uses or overrides a deprecated API.
Note: Recompile with -Xlint:deprecation for details.
18/01/26 23:37:51 INFO orm.CompilationManager: Writing jar file: /tmp/sqoop-donghua/compile/f0cac41ee0eb9df573aa4341b36a671d/current_dept_emp.jar
18/01/26 23:37:51 WARN manager.MySQLManager: It looks like you are importing from mysql.
18/01/26 23:37:51 WARN manager.MySQLManager: This transfer can be faster! Use the --direct
18/01/26 23:37:51 WARN manager.MySQLManager: option to exercise a MySQL-specific fast path.
18/01/26 23:37:51 INFO manager.MySQLManager: Setting zero DATETIME behavior to convertToNull (mysql)
18/01/26 23:37:51 ERROR tool.ImportTool: Import failed: No primary key could be found for table current_dept_emp. Please specify one with --split-by or perform a sequential import with '-m 1'.
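As the error states, the source (a view) has no primary key, so Sqoop cannot pick a split column on its own. The retry below uses '-m 1' (a single mapper); the alternative, used for the Hive import later in this section, is to name a split column explicitly, for example:

sqoop import --connect jdbc:mysql://cdh-vm.dbaglobe.com/employees \
  --username employee_user -P --table current_dept_emp --split-by emp_no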
[donghua@cdh-vm test_db-master]$ sqoop import --connect jdbc:mysql://cdh-vm.dbaglobe.com/employees --username employee_user --password password --table current_dept_emp -m 1
Warning: /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/bin/../lib/sqoop/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
18/01/26 23:38:08 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.13.1
18/01/26 23:38:08 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
18/01/26 23:38:08 INFO manager.MySQLManager: Preparing to use a MySQL streaming resultset.
18/01/26 23:38:08 INFO tool.CodeGenTool: Beginning code generation
18/01/26 23:38:09 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `current_dept_emp` AS t LIMIT 1
18/01/26 23:38:09 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `current_dept_emp` AS t LIMIT 1
18/01/26 23:38:09 INFO orm.CompilationManager: HADOOP_MAPRED_HOME is /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce
Note: /tmp/sqoop-donghua/compile/3cb418ffe5487ad8ed8b36689ec598f4/current_dept_emp.java uses or overrides a deprecated API.
Note: Recompile with -Xlint:deprecation for details.
18/01/26 23:38:10 INFO orm.CompilationManager: Writing jar file: /tmp/sqoop-donghua/compile/3cb418ffe5487ad8ed8b36689ec598f4/current_dept_emp.jar
18/01/26 23:38:11 WARN manager.MySQLManager: It looks like you are importing from mysql.
18/01/26 23:38:11 WARN manager.MySQLManager: This transfer can be faster! Use the --direct
18/01/26 23:38:11 WARN manager.MySQLManager: option to exercise a MySQL-specific fast path.
18/01/26 23:38:11 INFO manager.MySQLManager: Setting zero DATETIME behavior to convertToNull (mysql)
18/01/26 23:38:11 INFO mapreduce.ImportJobBase: Beginning import of current_dept_emp
18/01/26 23:38:11 INFO Configuration.deprecation: mapred.jar is deprecated. Instead, use mapreduce.job.jar
18/01/26 23:38:12 INFO Configuration.deprecation: mapred.map.tasks is deprecated. Instead, use mapreduce.job.maps
18/01/26 23:38:12 INFO client.RMProxy: Connecting to ResourceManager at cdh-vm.dbaglobe.com/192.168.56.10:8032
18/01/26 23:38:17 INFO db.DBInputFormat: Using read commited transaction isolation
18/01/26 23:38:18 INFO mapreduce.JobSubmitter: number of splits:1
18/01/26 23:38:18 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1517023991003_0001
18/01/26 23:38:19 INFO impl.YarnClientImpl: Submitted application application_1517023991003_0001
18/01/26 23:38:19 INFO mapreduce.Job: The url to track the job: http://cdh-vm.dbaglobe.com:8088/proxy/application_1517023991003_0001/
18/01/26 23:38:19 INFO mapreduce.Job: Running job: job_1517023991003_0001
18/01/26 23:38:30 INFO mapreduce.Job: Job job_1517023991003_0001 running in uber mode : false
18/01/26 23:38:30 INFO mapreduce.Job: map 0% reduce 0%
18/01/26 23:38:42 INFO mapreduce.Job: map 100% reduce 0%
18/01/26 23:38:43 INFO mapreduce.Job: Job job_1517023991003_0001 completed successfully
18/01/26 23:38:43 INFO mapreduce.Job: Counters: 30
File System Counters
FILE: Number of bytes read=0
FILE: Number of bytes written=173876
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=87
HDFS: Number of bytes written=10110817
HDFS: Number of read operations=4
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
Job Counters
Launched map tasks=1
Other local map tasks=1
Total time spent by all maps in occupied slots (ms)=8922
Total time spent by all reduces in occupied slots (ms)=0
Total time spent by all map tasks (ms)=8922
Total vcore-milliseconds taken by all map tasks=8922
Total megabyte-milliseconds taken by all map tasks=13704192
Map-Reduce Framework
Map input records=300024
Map output records=300024
Input split bytes=87
Spilled Records=0
Failed Shuffles=0
Merged Map outputs=0
GC time elapsed (ms)=109
CPU time spent (ms)=3330
Physical memory (bytes) snapshot=281448448
Virtual memory (bytes) snapshot=2788491264
Total committed heap usage (bytes)=246939648
File Input Format Counters
Bytes Read=0
File Output Format Counters
Bytes Written=10110817
18/01/26 23:38:43 INFO mapreduce.ImportJobBase: Transferred 9.6424 MB in 31.2284 seconds (316.1811 KB/sec)
18/01/26 23:38:43 INFO mapreduce.ImportJobBase: Retrieved 300024 records.
[donghua@cdh-vm test_db-master]$ hdfs dfs -ls
Found 3 items
drwx------ - donghua supergroup 0 2018-01-26 23:38 .staging
drwxr-xr-x - donghua supergroup 0 2018-01-26 23:38 current_dept_emp
-rw-r--r-- 1 donghua supergroup 15 2018-01-20 04:41 test.csv
[donghua@cdh-vm test_db-master]$
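To peek at the imported data (not in the capture; the directory name comes from the listing above, and part-m-00000 is the usual output file name for a single-mapper import):

hdfs dfs -cat current_dept_emp/part-m-00000 | head -5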
0: jdbc:hive2://localhost:10000/default> create database employees;
INFO : Compiling command(queryId=hive_20180126234646_4c4d2716-9d75-4786-8c31-1ee517688165): create database employees
INFO : Semantic Analysis Completed
INFO : Returning Hive schema: Schema(fieldSchemas:null, properties:null)
INFO : Completed compiling command(queryId=hive_20180126234646_4c4d2716-9d75-4786-8c31-1ee517688165); Time taken: 0.043 seconds
INFO : Executing command(queryId=hive_20180126234646_4c4d2716-9d75-4786-8c31-1ee517688165): create database employees
INFO : Starting task [Stage-0:DDL] in serial mode
INFO : Completed executing command(queryId=hive_20180126234646_4c4d2716-9d75-4786-8c31-1ee517688165); Time taken: 0.182 seconds
INFO : OK
No rows affected (0.351 seconds)
0: jdbc:hive2://localhost:10000/default> !sh hdfs dfs -ls /user/hive/warehouse/
Java HotSpot(TM) 64-Bit Server VM warning: ignoring option MaxPermSize=512M; support was removed in 8.0
Found 1 items
drwxrwxrwt - anonymous hive 0 2018-01-26 23:46 /user/hive/warehouse/employees.db
[donghua@cdh-vm test_db-master]$ sqoop import --connect jdbc:mysql://cdh-vm.dbaglobe.com/employees --username employee_user --password password --table current_dept_emp --split-by=emp_no --hive-import --create-hive-table --hive-table=employees.current_dept_emp --warehouse-dir=/user/hive/warehouse
Warning: /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/bin/../lib/sqoop/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
18/01/26 23:56:32 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.13.1
18/01/26 23:56:32 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
18/01/26 23:56:32 INFO tool.BaseSqoopTool: Using Hive-specific delimiters for output. You can override
18/01/26 23:56:32 INFO tool.BaseSqoopTool: delimiters with --fields-terminated-by, etc.
18/01/26 23:56:32 WARN tool.BaseSqoopTool: It seems that you're doing hive import directly into default
18/01/26 23:56:32 WARN tool.BaseSqoopTool: hive warehouse directory which is not supported. Sqoop is
18/01/26 23:56:32 WARN tool.BaseSqoopTool: firstly importing data into separate directory and then
18/01/26 23:56:32 WARN tool.BaseSqoopTool: inserting data into hive. Please consider removing
18/01/26 23:56:32 WARN tool.BaseSqoopTool: --target-dir or --warehouse-dir into /user/hive/warehouse in
18/01/26 23:56:32 WARN tool.BaseSqoopTool: case that you will detect any issues.
18/01/26 23:56:32 INFO manager.MySQLManager: Preparing to use a MySQL streaming resultset.
18/01/26 23:56:32 INFO tool.CodeGenTool: Beginning code generation
18/01/26 23:56:33 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `current_dept_emp` AS t LIMIT 1
18/01/26 23:56:33 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `current_dept_emp` AS t LIMIT 1
18/01/26 23:56:33 INFO orm.CompilationManager: HADOOP_MAPRED_HOME is /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce
Note: /tmp/sqoop-donghua/compile/35ced35e8590fbbd798fa058e0584fed/current_dept_emp.java uses or overrides a deprecated API.
Note: Recompile with -Xlint:deprecation for details.
18/01/26 23:56:35 INFO orm.CompilationManager: Writing jar file: /tmp/sqoop-donghua/compile/35ced35e8590fbbd798fa058e0584fed/current_dept_emp.jar
18/01/26 23:56:35 WARN manager.MySQLManager: It looks like you are importing from mysql.
18/01/26 23:56:35 WARN manager.MySQLManager: This transfer can be faster! Use the --direct
18/01/26 23:56:35 WARN manager.MySQLManager: option to exercise a MySQL-specific fast path.
18/01/26 23:56:35 INFO manager.MySQLManager: Setting zero DATETIME behavior to convertToNull (mysql)
18/01/26 23:56:35 INFO mapreduce.ImportJobBase: Beginning import of current_dept_emp
18/01/26 23:56:35 INFO Configuration.deprecation: mapred.jar is deprecated. Instead, use mapreduce.job.jar
18/01/26 23:56:36 INFO Configuration.deprecation: mapred.map.tasks is deprecated. Instead, use mapreduce.job.maps
18/01/26 23:56:36 INFO client.RMProxy: Connecting to ResourceManager at cdh-vm.dbaglobe.com/192.168.56.10:8032
18/01/26 23:56:41 INFO db.DBInputFormat: Using read commited transaction isolation
18/01/26 23:56:41 INFO db.DataDrivenDBInputFormat: BoundingValsQuery: SELECT MIN(`emp_no`), MAX(`emp_no`) FROM `current_dept_emp`
18/01/26 23:56:41 INFO db.IntegerSplitter: Split size: 122499; Num splits: 4 from: 10001 to: 499999
18/01/26 23:56:42 INFO mapreduce.JobSubmitter: number of splits:4
18/01/26 23:56:42 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1517023991003_0003
18/01/26 23:56:42 INFO impl.YarnClientImpl: Submitted application application_1517023991003_0003
18/01/26 23:56:42 INFO mapreduce.Job: The url to track the job: http://cdh-vm.dbaglobe.com:8088/proxy/application_1517023991003_0003/
18/01/26 23:56:42 INFO mapreduce.Job: Running job: job_1517023991003_0003
18/01/26 23:56:50 INFO mapreduce.Job: Job job_1517023991003_0003 running in uber mode : false
18/01/26 23:56:50 INFO mapreduce.Job: map 0% reduce 0%
18/01/26 23:56:58 INFO mapreduce.Job: map 25% reduce 0%
18/01/26 23:57:03 INFO mapreduce.Job: map 50% reduce 0%
18/01/26 23:57:08 INFO mapreduce.Job: map 75% reduce 0%
18/01/26 23:57:13 INFO mapreduce.Job: map 100% reduce 0%
18/01/26 23:57:14 INFO mapreduce.Job: Job job_1517023991003_0003 completed successfully
18/01/26 23:57:14 INFO mapreduce.Job: Counters: 30
File System Counters
FILE: Number of bytes read=0
FILE: Number of bytes written=698232
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=464
HDFS: Number of bytes written=10110817
HDFS: Number of read operations=16
HDFS: Number of large read operations=0
HDFS: Number of write operations=8
Job Counters
Launched map tasks=4
Other local map tasks=4
Total time spent by all maps in occupied slots (ms)=17721
Total time spent by all reduces in occupied slots (ms)=0
Total time spent by all map tasks (ms)=17721
Total vcore-milliseconds taken by all map tasks=17721
Total megabyte-milliseconds taken by all map tasks=27219456
Map-Reduce Framework
Map input records=300024
Map output records=300024
Input split bytes=464
Spilled Records=0
Failed Shuffles=0
Merged Map outputs=0
GC time elapsed (ms)=313
CPU time spent (ms)=8810
Physical memory (bytes) snapshot=927260672
Virtual memory (bytes) snapshot=11156475904
Total committed heap usage (bytes)=836239360
File Input Format Counters
Bytes Read=0
File Output Format Counters
Bytes Written=10110817
18/01/26 23:57:14 INFO mapreduce.ImportJobBase: Transferred 9.6424 MB in 38.4431 seconds (256.8429 KB/sec)
18/01/26 23:57:14 INFO mapreduce.ImportJobBase: Retrieved 300024 records.
18/01/26 23:57:14 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `current_dept_emp` AS t LIMIT 1
18/01/26 23:57:15 WARN hive.TableDefWriter: Column from_date had to be cast to a less precise type in Hive
18/01/26 23:57:15 WARN hive.TableDefWriter: Column to_date had to be cast to a less precise type in Hive
18/01/26 23:57:15 INFO hive.HiveImport: Loading uploaded data into Hive
Logging initialized using configuration in jar:file:/opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/jars/hive-common-1.1.0-cdh5.13.1.jar!/hive-log4j.properties
OK
Time taken: 3.967 seconds
Loading data to table employees.current_dept_emp
Table employees.current_dept_emp stats: [numFiles=4, totalSize=10110817]
OK
Time taken: 0.85 seconds
[donghua@cdh-vm test_db-master]$ sqoop import --connect jdbc:mysql://cdh-vm.dbaglobe.com/employees --username employee_user --password password --table current_dept_emp --split-by=emp_no --hive-import --create-hive-table --hive-table=employees.current_dept_emp2 --target-dir=/user/donghua/current_dept_emp2
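(The output of this second import was not captured; it is essentially the same as the previous run, and the resulting table current_dept_emp2 appears in the Hive listing below.)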
0: jdbc:hive2://localhost:10000/default> use employees;
INFO : Compiling command(queryId=hive_20180127000909_679b9dfa-5161-467c-9620-8081c6686c8e): use employees
INFO : Semantic Analysis Completed
INFO : Returning Hive schema: Schema(fieldSchemas:null, properties:null)
INFO : Completed compiling command(queryId=hive_20180127000909_679b9dfa-5161-467c-9620-8081c6686c8e); Time taken: 0.007 seconds
INFO : Executing command(queryId=hive_20180127000909_679b9dfa-5161-467c-9620-8081c6686c8e): use employees
INFO : Starting task [Stage-0:DDL] in serial mode
INFO : Completed executing command(queryId=hive_20180127000909_679b9dfa-5161-467c-9620-8081c6686c8e); Time taken: 0.021 seconds
INFO : OK
No rows affected (0.048 seconds)
0: jdbc:hive2://localhost:10000/default> !tables
+------------+--------------+--------------------+-------------+-------------------------------------------+--+
| TABLE_CAT | TABLE_SCHEM | TABLE_NAME | TABLE_TYPE | REMARKS |
+------------+--------------+--------------------+-------------+-------------------------------------------+--+
| | employees | current_dept_emp | TABLE | Imported by sqoop on 2018/01/26 23:57:15 |
| | employees | current_dept_emp2 | TABLE | Imported by sqoop on 2018/01/27 00:01:05 |
+------------+--------------+--------------------+-------------+-------------------------------------------+--+
0: jdbc:hive2://localhost:10000/default> !set maxcolumnwidth 200
0: jdbc:hive2://localhost:10000/default> show create table employees.current_dept_emp;
INFO : Compiling command(queryId=hive_20180127005252_fe156650-eacf-492d-8860-17af7d4fc590): show create table employees.current_dept_emp
INFO : Semantic Analysis Completed
INFO : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:createtab_stmt, type:string, comment:from deserializer)], properties:null)
INFO : Completed compiling command(queryId=hive_20180127005252_fe156650-eacf-492d-8860-17af7d4fc590); Time taken: 0.03 seconds
INFO : Executing command(queryId=hive_20180127005252_fe156650-eacf-492d-8860-17af7d4fc590): show create table employees.current_dept_emp
INFO : Starting task [Stage-0:DDL] in serial mode
INFO : Completed executing command(queryId=hive_20180127005252_fe156650-eacf-492d-8860-17af7d4fc590); Time taken: 0.009 seconds
INFO : OK
+----------------------------------------------------------------------------------------+--+
| createtab_stmt |
+----------------------------------------------------------------------------------------+--+
| CREATE TABLE `employees.current_dept_emp`( |
| `emp_no` int, |
| `dept_no` string, |
| `from_date` string, |
| `to_date` string) |
| COMMENT 'Imported by sqoop on 2018/01/26 23:57:15' |
| ROW FORMAT SERDE |
| 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' |
| WITH SERDEPROPERTIES ( |
| 'field.delim'='\u0001', |
| 'line.delim'='\n', |
| 'serialization.format'='\u0001') |
| STORED AS INPUTFORMAT |
| 'org.apache.hadoop.mapred.TextInputFormat' |
| OUTPUTFORMAT |
| 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' |
| LOCATION |
| 'hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/employees.db/current_dept_emp' |
| TBLPROPERTIES ( |
| 'COLUMN_STATS_ACCURATE'='true', |
| 'numFiles'='4', |
| 'totalSize'='10110817', |
| 'transient_lastDdlTime'='1517029041') |
+----------------------------------------------------------------------------------------+--+
23 rows selected (0.087 seconds)
0: jdbc:hive2://localhost:10000/default> show create table employees.current_dept_emp2;
INFO : Compiling command(queryId=hive_20180127005252_e90f722a-ffd4-400d-ae8b-aa76c382dc78): show create table employees.current_dept_emp2
INFO : Semantic Analysis Completed
INFO : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:createtab_stmt, type:string, comment:from deserializer)], properties:null)
INFO : Completed compiling command(queryId=hive_20180127005252_e90f722a-ffd4-400d-ae8b-aa76c382dc78); Time taken: 0.027 seconds
INFO : Executing command(queryId=hive_20180127005252_e90f722a-ffd4-400d-ae8b-aa76c382dc78): show create table employees.current_dept_emp2
INFO : Starting task [Stage-0:DDL] in serial mode
INFO : Completed executing command(queryId=hive_20180127005252_e90f722a-ffd4-400d-ae8b-aa76c382dc78); Time taken: 0.013 seconds
INFO : OK
+-----------------------------------------------------------------------------------------+--+
| createtab_stmt |
+-----------------------------------------------------------------------------------------+--+
| CREATE TABLE `employees.current_dept_emp2`( |
| `emp_no` int, |
| `dept_no` string, |
| `from_date` string, |
| `to_date` string) |
| COMMENT 'Imported by sqoop on 2018/01/27 00:01:05' |
| ROW FORMAT SERDE |
| 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' |
| WITH SERDEPROPERTIES ( |
| 'field.delim'='\u0001', |
| 'line.delim'='\n', |
| 'serialization.format'='\u0001') |
| STORED AS INPUTFORMAT |
| 'org.apache.hadoop.mapred.TextInputFormat' |
| OUTPUTFORMAT |
| 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' |
| LOCATION |
| 'hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/employees.db/current_dept_emp2' |
| TBLPROPERTIES ( |
| 'COLUMN_STATS_ACCURATE'='true', |
| 'numFiles'='4', |
| 'totalSize'='10110817', |
| 'transient_lastDdlTime'='1517029269') |
+-----------------------------------------------------------------------------------------+--+
23 rows selected (0.079 seconds)
0: jdbc:hive2://localhost:10000/default>
[root@cdh-vm ~]# hdfs dfs -ls /user//hive/warehouse
Found 2 items
drwxrwxrwt - donghua hive 0 2018-01-27 00:01 /user/hive/warehouse/employees.db
drwxrwxrwt - donghua hive 0 2018-01-27 00:38 /user/hive/warehouse/test.db
[hdfs@cdh-vm ~]$ hdfs dfs -ls /user//hive/warehouse/employees.db
Found 2 items
drwxrwxrwt - donghua hive 0 2018-01-26 23:57 /user/hive/warehouse/employees.db/current_dept_emp
drwxrwxrwt - donghua hive 0 2018-01-27 00:01 /user/hive/warehouse/employees.db/current_dept_emp2
0: jdbc:hive2://localhost:10000/default> select count(*) from employees.current_dept_emp;
INFO : Compiling command(queryId=hive_20180127003737_611e22dd-873d-48f6-9888-a414f9b5cf0a): select count(*) from employees.current_dept_emp
INFO : Semantic Analysis Completed
INFO : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:_c0, type:bigint, comment:null)], properties:null)
INFO : Completed compiling command(queryId=hive_20180127003737_611e22dd-873d-48f6-9888-a414f9b5cf0a); Time taken: 0.065 seconds
INFO : Executing command(queryId=hive_20180127003737_611e22dd-873d-48f6-9888-a414f9b5cf0a): select count(*) from employees.current_dept_emp
INFO : Query ID = hive_20180127003737_611e22dd-873d-48f6-9888-a414f9b5cf0a
INFO : Total jobs = 1
INFO : Launching Job 1 out of 1
INFO : Starting task [Stage-1:MAPRED] in serial mode
INFO : Number of reduce tasks determined at compile time: 1
INFO : In order to change the average load for a reducer (in bytes):
INFO : set hive.exec.reducers.bytes.per.reducer=
INFO : In order to limit the maximum number of reducers:
INFO : set hive.exec.reducers.max=
INFO : In order to set a constant number of reducers:
INFO : set mapreduce.job.reduces=
INFO : number of splits:1
INFO : Submitting tokens for job: job_1517023991003_0007
INFO : The url to track the job: http://cdh-vm.dbaglobe.com:8088/proxy/application_1517023991003_0007/
INFO : Starting Job = job_1517023991003_0007, Tracking URL = http://cdh-vm.dbaglobe.com:8088/proxy/application_1517023991003_0007/
INFO : Kill Command = /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/hadoop/bin/hadoop job -kill job_1517023991003_0007
INFO : Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
INFO : 2018-01-27 00:37:50,690 Stage-1 map = 0%, reduce = 0%
INFO : 2018-01-27 00:37:58,188 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 1.83 sec
INFO : 2018-01-27 00:38:05,606 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 3.79 sec
INFO : MapReduce Total cumulative CPU time: 3 seconds 790 msec
INFO : Ended Job = job_1517023991003_0007
INFO : MapReduce Jobs Launched:
INFO : Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 3.79 sec HDFS Read: 10118840 HDFS Write: 7 SUCCESS
INFO : Total MapReduce CPU Time Spent: 3 seconds 790 msec
INFO : Completed executing command(queryId=hive_20180127003737_611e22dd-873d-48f6-9888-a414f9b5cf0a); Time taken: 23.26 seconds
INFO : OK
+---------+--+
| _c0 |
+---------+--+
| 300024 |
+---------+--+
1 row selected (23.371 seconds)
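The Hive count (300024) matches the record count Sqoop reported for the import, confirming the view was loaded in full.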
Use Sqoop to perform incremental import (--check-column=id --incremental=append --last-value=)
[donghua@cdh-vm ~]$ mysql -u employee_user -ppassword -D employees
MariaDB [employees]> create table t1 (id int primary key, c1 varchar(10));
MariaDB [employees]> insert into t1 values(1,'a'),(2,'b');
MariaDB [employees]> select * from t1;
+----+------+
| id | c1 |
+----+------+
| 1 | a |
| 2 | b |
+----+------+
2 rows in set (0.00 sec)
[donghua@cdh-vm ~]$ beeline -u jdbc:hive2://localhost:10000/default -n donghua --silent=true
0: jdbc:hive2://localhost:10000/default> create table employees.t1(id int, c1 varchar(10));
[donghua@cdh-vm ~]$ sqoop import --connect jdbc:mysql://cdh-vm.dbaglobe.com/employees \
> --username employee_user --password password --table t1 \
> --split-by=id --hive-import --hive-table=employees.t1 \
> --check-column=id --incremental=append --last-value=0
Warning: /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/bin/../lib/sqoop/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
18/01/27 03:54:38 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.13.1
18/01/27 03:54:38 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
18/01/27 03:54:38 INFO tool.BaseSqoopTool: Using Hive-specific delimiters for output. You can override
18/01/27 03:54:38 INFO tool.BaseSqoopTool: delimiters with --fields-terminated-by, etc.
18/01/27 03:54:38 WARN tool.BaseSqoopTool: It seems that you're doing hive import directly into default
18/01/27 03:54:38 WARN tool.BaseSqoopTool: hive warehouse directory which is not supported. Sqoop is
18/01/27 03:54:38 WARN tool.BaseSqoopTool: firstly importing data into separate directory and then
18/01/27 03:54:38 WARN tool.BaseSqoopTool: inserting data into hive. Please consider removing
18/01/27 03:54:38 WARN tool.BaseSqoopTool: --target-dir or --warehouse-dir into /user/hive/warehouse in
18/01/27 03:54:38 WARN tool.BaseSqoopTool: case that you will detect any issues.
18/01/27 03:54:38 INFO manager.MySQLManager: Preparing to use a MySQL streaming resultset.
18/01/27 03:54:38 INFO tool.CodeGenTool: Beginning code generation
18/01/27 03:54:38 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `t1` AS t LIMIT 1
18/01/27 03:54:38 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `t1` AS t LIMIT 1
18/01/27 03:54:38 INFO orm.CompilationManager: HADOOP_MAPRED_HOME is /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce
Note: /tmp/sqoop-donghua/compile/1941b9efeafd888916e872561fa71b1d/t1.java uses or overrides a deprecated API.
Note: Recompile with -Xlint:deprecation for details.
18/01/27 03:54:40 INFO orm.CompilationManager: Writing jar file: /tmp/sqoop-donghua/compile/1941b9efeafd888916e872561fa71b1d/t1.jar
18/01/27 03:54:41 INFO tool.ImportTool: Maximal id query for free form incremental import: SELECT MAX(`id`) FROM `t1`
18/01/27 03:54:41 INFO tool.ImportTool: Incremental import based on column `id`
18/01/27 03:54:41 INFO tool.ImportTool: Lower bound value: 0
18/01/27 03:54:41 INFO tool.ImportTool: Upper bound value: 2
18/01/27 03:54:41 WARN manager.MySQLManager: It looks like you are importing from mysql.
18/01/27 03:54:41 WARN manager.MySQLManager: This transfer can be faster! Use the --direct
18/01/27 03:54:41 WARN manager.MySQLManager: option to exercise a MySQL-specific fast path.
18/01/27 03:54:41 INFO manager.MySQLManager: Setting zero DATETIME behavior to convertToNull (mysql)
18/01/27 03:54:41 INFO mapreduce.ImportJobBase: Beginning import of t1
18/01/27 03:54:41 INFO Configuration.deprecation: mapred.jar is deprecated. Instead, use mapreduce.job.jar
18/01/27 03:54:41 INFO Configuration.deprecation: mapred.map.tasks is deprecated. Instead, use mapreduce.job.maps
18/01/27 03:54:41 INFO client.RMProxy: Connecting to ResourceManager at cdh-vm.dbaglobe.com/192.168.56.10:8032
18/01/27 03:54:46 INFO db.DBInputFormat: Using read commited transaction isolation
18/01/27 03:54:46 INFO db.DataDrivenDBInputFormat: BoundingValsQuery: SELECT MIN(`id`), MAX(`id`) FROM `t1` WHERE ( `id` > 0 AND `id` <= 2 )
18/01/27 03:54:46 INFO db.IntegerSplitter: Split size: 0; Num splits: 4 from: 1 to: 2
18/01/27 03:54:46 INFO mapreduce.JobSubmitter: number of splits:2
18/01/27 03:54:47 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1517023991003_0009
18/01/27 03:54:47 INFO impl.YarnClientImpl: Submitted application application_1517023991003_0009
18/01/27 03:54:47 INFO mapreduce.Job: The url to track the job: http://cdh-vm.dbaglobe.com:8088/proxy/application_1517023991003_0009/
18/01/27 03:54:47 INFO mapreduce.Job: Running job: job_1517023991003_0009
18/01/27 03:54:54 INFO mapreduce.Job: Job job_1517023991003_0009 running in uber mode : false
18/01/27 03:54:54 INFO mapreduce.Job: map 0% reduce 0%
18/01/27 03:55:02 INFO mapreduce.Job: map 50% reduce 0%
18/01/27 03:55:06 INFO mapreduce.Job: map 100% reduce 0%
18/01/27 03:55:07 INFO mapreduce.Job: Job job_1517023991003_0009 completed successfully
18/01/27 03:55:07 INFO mapreduce.Job: Counters: 30
File System Counters
FILE: Number of bytes read=0
FILE: Number of bytes written=350308
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=197
HDFS: Number of bytes written=8
HDFS: Number of read operations=8
HDFS: Number of large read operations=0
HDFS: Number of write operations=4
Job Counters
Launched map tasks=2
Other local map tasks=2
Total time spent by all maps in occupied slots (ms)=7843
Total time spent by all reduces in occupied slots (ms)=0
Total time spent by all map tasks (ms)=7843
Total vcore-milliseconds taken by all map tasks=7843
Total megabyte-milliseconds taken by all map tasks=12046848
Map-Reduce Framework
Map input records=2
Map output records=2
Input split bytes=197
Spilled Records=0
Failed Shuffles=0
Merged Map outputs=0
GC time elapsed (ms)=110
CPU time spent (ms)=1970
Physical memory (bytes) snapshot=413765632
Virtual memory (bytes) snapshot=5572857856
Total committed heap usage (bytes)=402653184
File Input Format Counters
Bytes Read=0
File Output Format Counters
Bytes Written=8
18/01/27 03:55:07 INFO mapreduce.ImportJobBase: Transferred 8 bytes in 26.2002 seconds (0.3053 bytes/sec)
18/01/27 03:55:07 INFO mapreduce.ImportJobBase: Retrieved 2 records.
18/01/27 03:55:07 INFO util.AppendUtils: Creating missing output directory - t1
18/01/27 03:55:07 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `t1` AS t LIMIT 1
18/01/27 03:55:07 INFO hive.HiveImport: Loading uploaded data into Hive
Logging initialized using configuration in jar:file:/opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/jars/hive-common-1.1.0-cdh5.13.1.jar!/hive-log4j.properties
OK
Time taken: 2.037 seconds
Loading data to table employees.t1
Table employees.t1 stats: [numFiles=2, totalSize=8]
OK
Time taken: 0.646 seconds
0: jdbc:hive2://localhost:10000/default> select * from employees.t1;
+--------+--------+--+
| t1.id | t1.c1 |
+--------+--------+--+
| 1 | a |
| 2 | b |
+--------+--------+--+
MariaDB [employees]> insert into t1 values(3,'a'),(4,'b');
Query OK, 2 rows affected (0.00 sec)
Records: 2 Duplicates: 0 Warnings: 0
[donghua@cdh-vm ~]$ sqoop import --connect jdbc:mysql://cdh-vm.dbaglobe.com/employees --username employee_user --password password --table t1 --split-by=id --hive-import --hive-table=employees.t1 --warehouse-dir=/user/hive/warehouse --check-column=id --incremental=append --last-value=2
Warning: /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/bin/../lib/sqoop/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
18/01/27 04:11:31 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.13.1
18/01/27 04:11:31 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
18/01/27 04:11:31 INFO tool.BaseSqoopTool: Using Hive-specific delimiters for output. You can override
18/01/27 04:11:31 INFO tool.BaseSqoopTool: delimiters with --fields-terminated-by, etc.
18/01/27 04:11:31 WARN tool.BaseSqoopTool: It seems that you're doing hive import directly into default
18/01/27 04:11:31 WARN tool.BaseSqoopTool: hive warehouse directory which is not supported. Sqoop is
18/01/27 04:11:31 WARN tool.BaseSqoopTool: firstly importing data into separate directory and then
18/01/27 04:11:31 WARN tool.BaseSqoopTool: inserting data into hive. Please consider removing
18/01/27 04:11:31 WARN tool.BaseSqoopTool: --target-dir or --warehouse-dir into /user/hive/warehouse in
18/01/27 04:11:31 WARN tool.BaseSqoopTool: case that you will detect any issues.
18/01/27 04:11:31 INFO manager.MySQLManager: Preparing to use a MySQL streaming resultset.
18/01/27 04:11:31 INFO tool.CodeGenTool: Beginning code generation
18/01/27 04:11:31 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `t1` AS t LIMIT 1
18/01/27 04:11:31 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `t1` AS t LIMIT 1
18/01/27 04:11:31 INFO orm.CompilationManager: HADOOP_MAPRED_HOME is /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce
Note: /tmp/sqoop-donghua/compile/80c2f1f6c1f1b6c4b9fca928aa6353a8/t1.java uses or overrides a deprecated API.
Note: Recompile with -Xlint:deprecation for details.
18/01/27 04:11:33 INFO orm.CompilationManager: Writing jar file: /tmp/sqoop-donghua/compile/80c2f1f6c1f1b6c4b9fca928aa6353a8/t1.jar
18/01/27 04:11:34 INFO tool.ImportTool: Maximal id query for free form incremental import: SELECT MAX(`id`) FROM `t1`
18/01/27 04:11:34 INFO tool.ImportTool: Incremental import based on column `id`
18/01/27 04:11:34 INFO tool.ImportTool: Lower bound value: 2
18/01/27 04:11:34 INFO tool.ImportTool: Upper bound value: 4
18/01/27 04:11:34 WARN manager.MySQLManager: It looks like you are importing from mysql.
18/01/27 04:11:34 WARN manager.MySQLManager: This transfer can be faster! Use the --direct
18/01/27 04:11:34 WARN manager.MySQLManager: option to exercise a MySQL-specific fast path.
18/01/27 04:11:34 INFO manager.MySQLManager: Setting zero DATETIME behavior to convertToNull (mysql)
18/01/27 04:11:34 INFO mapreduce.ImportJobBase: Beginning import of t1
18/01/27 04:11:34 INFO Configuration.deprecation: mapred.jar is deprecated. Instead, use mapreduce.job.jar
18/01/27 04:11:34 INFO Configuration.deprecation: mapred.map.tasks is deprecated. Instead, use mapreduce.job.maps
18/01/27 04:11:34 INFO client.RMProxy: Connecting to ResourceManager at cdh-vm.dbaglobe.com/192.168.56.10:8032
18/01/27 04:11:38 INFO db.DBInputFormat: Using read commited transaction isolation
18/01/27 04:11:38 INFO db.DataDrivenDBInputFormat: BoundingValsQuery: SELECT MIN(`id`), MAX(`id`) FROM `t1` WHERE ( `id` > 2 AND `id` <= 4 )
18/01/27 04:11:38 INFO db.IntegerSplitter: Split size: 0; Num splits: 4 from: 3 to: 4
18/01/27 04:11:38 INFO mapreduce.JobSubmitter: number of splits:2
18/01/27 04:11:38 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1517023991003_0012
18/01/27 04:11:38 INFO impl.YarnClientImpl: Submitted application application_1517023991003_0012
18/01/27 04:11:38 INFO mapreduce.Job: The url to track the job: http://cdh-vm.dbaglobe.com:8088/proxy/application_1517023991003_0012/
18/01/27 04:11:38 INFO mapreduce.Job: Running job: job_1517023991003_0012
18/01/27 04:11:45 INFO mapreduce.Job: Job job_1517023991003_0012 running in uber mode : false
18/01/27 04:11:45 INFO mapreduce.Job: map 0% reduce 0%
18/01/27 04:11:51 INFO mapreduce.Job: map 50% reduce 0%
18/01/27 04:11:57 INFO mapreduce.Job: map 100% reduce 0%
18/01/27 04:11:57 INFO mapreduce.Job: Job job_1517023991003_0012 completed successfully
18/01/27 04:11:57 INFO mapreduce.Job: Counters: 30
File System Counters
FILE: Number of bytes read=0
FILE: Number of bytes written=350308
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=197
HDFS: Number of bytes written=8
HDFS: Number of read operations=8
HDFS: Number of large read operations=0
HDFS: Number of write operations=4
Job Counters
Launched map tasks=2
Other local map tasks=2
Total time spent by all maps in occupied slots (ms)=7531
Total time spent by all reduces in occupied slots (ms)=0
Total time spent by all map tasks (ms)=7531
Total vcore-milliseconds taken by all map tasks=7531
Total megabyte-milliseconds taken by all map tasks=11567616
Map-Reduce Framework
Map input records=2
Map output records=2
Input split bytes=197
Spilled Records=0
Failed Shuffles=0
Merged Map outputs=0
GC time elapsed (ms)=114
CPU time spent (ms)=1800
Physical memory (bytes) snapshot=403120128
Virtual memory (bytes) snapshot=5573816320
Total committed heap usage (bytes)=359137280
File Input Format Counters
Bytes Read=0
File Output Format Counters
Bytes Written=8
18/01/27 04:11:57 INFO mapreduce.ImportJobBase: Transferred 8 bytes in 23.359 seconds (0.3425 bytes/sec)
18/01/27 04:11:57 INFO mapreduce.ImportJobBase: Retrieved 2 records.
18/01/27 04:11:57 INFO util.AppendUtils: Creating missing output directory - t1
18/01/27 04:11:57 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `t1` AS t LIMIT 1
18/01/27 04:11:57 INFO hive.HiveImport: Loading uploaded data into Hive
Logging initialized using configuration in jar:file:/opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/jars/hive-common-1.1.0-cdh5.13.1.jar!/hive-log4j.properties
OK
Time taken: 1.853 seconds
Loading data to table employees.t1
Table employees.t1 stats: [numFiles=4, numRows=0, totalSize=16, rawDataSize=0]
OK
Time taken: 0.603 seconds
0: jdbc:hive2://localhost:10000/default> select * from employees.t1;
+--------+--------+--+
| t1.id | t1.c1 |
+--------+--------+--+
| 1 | a |
| 3 | a |
| 2 | b |
| 4 | b |
+--------+--------+--+
[donghua@cdh-vm ~]$ sqoop job --create emp_t1_incr -- import --connect jdbc:mysql://cdh-vm.dbaglobe.com/employees --username employee_user --password password --table t1 --split-by=id --hive-import --hive-table=employees.t1 --warehouse-dir=/user/hive/warehouse --check-column=id --incremental=append --last-value=4
Warning: /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/bin/../lib/sqoop/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
18/01/27 04:21:32 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.13.1
18/01/27 04:21:32 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
18/01/27 04:21:32 INFO tool.BaseSqoopTool: Using Hive-specific delimiters for output. You can override
18/01/27 04:21:32 INFO tool.BaseSqoopTool: delimiters with --fields-terminated-by, etc.
18/01/27 04:21:32 WARN tool.BaseSqoopTool: It seems that you're doing hive import directly into default
18/01/27 04:21:32 WARN tool.BaseSqoopTool: hive warehouse directory which is not supported. Sqoop is
18/01/27 04:21:32 WARN tool.BaseSqoopTool: firstly importing data into separate directory and then
18/01/27 04:21:32 WARN tool.BaseSqoopTool: inserting data into hive. Please consider removing
18/01/27 04:21:32 WARN tool.BaseSqoopTool: --target-dir or --warehouse-dir into /user/hive/warehouse in
18/01/27 04:21:32 WARN tool.BaseSqoopTool: case that you will detect any issues.
[donghua@cdh-vm ~]$ sqoop job --list
Warning: /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/bin/../lib/sqoop/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
18/01/27 04:30:13 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.13.1
Available jobs:
emp_t1_incr
[donghua@cdh-vm ~]$ sqoop job --show emp_t1_incr
Warning: /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/bin/../lib/sqoop/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
18/01/27 04:35:40 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.13.1
Enter password:
Job: emp_t1_incr
Tool: import
Options:
----------------------------
verbose = false
hcatalog.drop.and.create.table = false
incremental.last.value = 4
db.connect.string = jdbc:mysql://cdh-vm.dbaglobe.com/employees
codegen.output.delimiters.escape = 0
codegen.output.delimiters.enclose.required = false
codegen.input.delimiters.field = 0
mainframe.input.dataset.type = p
split.limit = null
hbase.create.table = false
db.require.password = true
hdfs.append.dir = true
db.table = t1
codegen.input.delimiters.escape = 0
accumulo.create.table = false
import.fetch.size = null
codegen.input.delimiters.enclose.required = false
db.username = employee_user
reset.onemapper = false
codegen.output.delimiters.record = 10
import.max.inline.lob.size = 16777216
sqoop.throwOnError = false
hbase.bulk.load.enabled = false
hcatalog.create.table = false
db.clear.staging.table = false
incremental.col = id
codegen.input.delimiters.record = 0
hdfs.warehouse.dir = /user/hive/warehouse
enable.compression = false
hive.overwrite.table = false
hive.import = true
codegen.input.delimiters.enclose = 0
hive.table.name = employees.t1
accumulo.batch.size = 10240000
hive.drop.delims = false
customtool.options.jsonmap = {}
codegen.output.delimiters.enclose = 0
hdfs.delete-target.dir = false
codegen.output.dir = .
codegen.auto.compile.dir = true
relaxed.isolation = false
mapreduce.num.mappers = 4
accumulo.max.latency = 5000
import.direct.split.size = 0
sqlconnection.metadata.transaction.isolation.level = 2
codegen.output.delimiters.field = 1
export.new.update = UpdateOnly
incremental.mode = AppendRows
hdfs.file.format = TextFile
sqoop.oracle.escaping.disabled = true
codegen.compile.dir = /tmp/sqoop-donghua/compile/e7212eb92686a1486fa1cd44a6c9afc7
direct.import = false
temporary.dirRoot = _sqoop
db.split.column = id
hive.fail.table.exists = false
db.batch = false
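Not shown in the capture: two more rows (id 5 and 6) were inserted into MariaDB before executing the saved job, which is why the incremental bounds reported below are 4 and 6. For example:

insert into t1 values(5,'a'),(6,'b');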
[donghua@cdh-vm ~]$ sqoop job --exec emp_t1_incr
Warning: /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/bin/../lib/sqoop/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
18/01/27 04:38:01 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.13.1
Enter password:
18/01/27 04:38:06 WARN tool.BaseSqoopTool: It seems that you're doing hive import directly into default
18/01/27 04:38:06 WARN tool.BaseSqoopTool: hive warehouse directory which is not supported. Sqoop is
18/01/27 04:38:06 WARN tool.BaseSqoopTool: firstly importing data into separate directory and then
18/01/27 04:38:06 WARN tool.BaseSqoopTool: inserting data into hive. Please consider removing
18/01/27 04:38:06 WARN tool.BaseSqoopTool: --target-dir or --warehouse-dir into /user/hive/warehouse in
18/01/27 04:38:06 WARN tool.BaseSqoopTool: case that you will detect any issues.
18/01/27 04:38:06 INFO manager.MySQLManager: Preparing to use a MySQL streaming resultset.
18/01/27 04:38:06 INFO tool.CodeGenTool: Beginning code generation
18/01/27 04:38:07 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `t1` AS t LIMIT 1
18/01/27 04:38:07 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `t1` AS t LIMIT 1
18/01/27 04:38:07 INFO orm.CompilationManager: HADOOP_MAPRED_HOME is /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce
Note: /tmp/sqoop-donghua/compile/e3c397f1a5469f870ba19e95b80a66a2/t1.java uses or overrides a deprecated API.
Note: Recompile with -Xlint:deprecation for details.
18/01/27 04:38:08 INFO orm.CompilationManager: Writing jar file: /tmp/sqoop-donghua/compile/e3c397f1a5469f870ba19e95b80a66a2/t1.jar
18/01/27 04:38:09 INFO tool.ImportTool: Maximal id query for free form incremental import: SELECT MAX(`id`) FROM `t1`
18/01/27 04:38:09 INFO tool.ImportTool: Incremental import based on column `id`
18/01/27 04:38:09 INFO tool.ImportTool: Lower bound value: 4
18/01/27 04:38:09 INFO tool.ImportTool: Upper bound value: 6
18/01/27 04:38:09 WARN manager.MySQLManager: It looks like you are importing from mysql.
18/01/27 04:38:09 WARN manager.MySQLManager: This transfer can be faster! Use the --direct
18/01/27 04:38:09 WARN manager.MySQLManager: option to exercise a MySQL-specific fast path.
18/01/27 04:38:09 INFO manager.MySQLManager: Setting zero DATETIME behavior to convertToNull (mysql)
18/01/27 04:38:09 INFO mapreduce.ImportJobBase: Beginning import of t1
18/01/27 04:38:09 INFO Configuration.deprecation: mapred.jar is deprecated. Instead, use mapreduce.job.jar
18/01/27 04:38:09 INFO Configuration.deprecation: mapred.map.tasks is deprecated. Instead, use mapreduce.job.maps
18/01/27 04:38:09 INFO client.RMProxy: Connecting to ResourceManager at cdh-vm.dbaglobe.com/192.168.56.10:8032
18/01/27 04:38:12 INFO db.DBInputFormat: Using read commited transaction isolation
18/01/27 04:38:12 INFO db.DataDrivenDBInputFormat: BoundingValsQuery: SELECT MIN(`id`), MAX(`id`) FROM `t1` WHERE ( `id` > 4 AND `id` <= 6 )
18/01/27 04:38:12 INFO db.IntegerSplitter: Split size: 0; Num splits: 4 from: 5 to: 6
18/01/27 04:38:12 INFO mapreduce.JobSubmitter: number of splits:2
18/01/27 04:38:12 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1517023991003_0013
18/01/27 04:38:12 INFO impl.YarnClientImpl: Submitted application application_1517023991003_0013
18/01/27 04:38:12 INFO mapreduce.Job: The url to track the job: http://cdh-vm.dbaglobe.com:8088/proxy/application_1517023991003_0013/
18/01/27 04:38:12 INFO mapreduce.Job: Running job: job_1517023991003_0013
18/01/27 04:38:20 INFO mapreduce.Job: Job job_1517023991003_0013 running in uber mode : false
18/01/27 04:38:20 INFO mapreduce.Job: map 0% reduce 0%
18/01/27 04:38:26 INFO mapreduce.Job: map 50% reduce 0%
18/01/27 04:38:31 INFO mapreduce.Job: map 100% reduce 0%
18/01/27 04:38:32 INFO mapreduce.Job: Job job_1517023991003_0013 completed successfully
18/01/27 04:38:33 INFO mapreduce.Job: Counters: 30
File System Counters
FILE: Number of bytes read=0
FILE: Number of bytes written=351166
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=197
HDFS: Number of bytes written=8
HDFS: Number of read operations=8
HDFS: Number of large read operations=0
HDFS: Number of write operations=4
Job Counters
Launched map tasks=2
Other local map tasks=2
Total time spent by all maps in occupied slots (ms)=8071
Total time spent by all reduces in occupied slots (ms)=0
Total time spent by all map tasks (ms)=8071
Total vcore-milliseconds taken by all map tasks=8071
Total megabyte-milliseconds taken by all map tasks=12397056
Map-Reduce Framework
Map input records=2
Map output records=2
Input split bytes=197
Spilled Records=0
Failed Shuffles=0
Merged Map outputs=0
GC time elapsed (ms)=136
CPU time spent (ms)=1820
Physical memory (bytes) snapshot=404738048
Virtual memory (bytes) snapshot=5573140480
Total committed heap usage (bytes)=354942976
File Input Format Counters
Bytes Read=0
File Output Format Counters
Bytes Written=8
18/01/27 04:38:33 INFO mapreduce.ImportJobBase: Transferred 8 bytes in 23.4364 seconds (0.3413 bytes/sec)
18/01/27 04:38:33 INFO mapreduce.ImportJobBase: Retrieved 2 records.
18/01/27 04:38:33 INFO util.AppendUtils: Creating missing output directory - t1
18/01/27 04:38:33 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `t1` AS t LIMIT 1
18/01/27 04:38:33 INFO hive.HiveImport: Loading uploaded data into Hive
Logging initialized using configuration in jar:file:/opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/jars/hive-common-1.1.0-cdh5.13.1.jar!/hive-log4j.properties
OK
Time taken: 1.95 seconds
Loading data to table employees.t1
Table employees.t1 stats: [numFiles=6, numRows=0, totalSize=24, rawDataSize=0]
OK
Time taken: 0.664 seconds
[donghua@cdh-vm ~]$ sqoop job --show emp_t1_incr
Warning: /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/bin/../lib/sqoop/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
18/01/27 04:38:50 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.13.1
Enter password:
Job: emp_t1_incr
Tool: import
Options:
----------------------------
verbose = false
hcatalog.drop.and.create.table = false
incremental.last.value = 6
db.connect.string = jdbc:mysql://cdh-vm.dbaglobe.com/employees
codegen.output.delimiters.escape = 0
codegen.output.delimiters.enclose.required = false
codegen.input.delimiters.field = 0
mainframe.input.dataset.type = p
split.limit = null
hbase.create.table = false
db.require.password = true
hdfs.append.dir = true
db.table = t1
codegen.input.delimiters.escape = 0
accumulo.create.table = false
import.fetch.size = null
codegen.input.delimiters.enclose.required = false
db.username = employee_user
reset.onemapper = false
codegen.output.delimiters.record = 10
import.max.inline.lob.size = 16777216
sqoop.throwOnError = false
hbase.bulk.load.enabled = false
hcatalog.create.table = false
db.clear.staging.table = false
incremental.col = id
codegen.input.delimiters.record = 0
hdfs.warehouse.dir = /user/hive/warehouse
enable.compression = false
hive.overwrite.table = false
hive.import = true
codegen.input.delimiters.enclose = 0
hive.table.name = employees.t1
accumulo.batch.size = 10240000
hive.drop.delims = false
customtool.options.jsonmap = {}
codegen.output.delimiters.enclose = 0
hdfs.delete-target.dir = false
codegen.output.dir = .
codegen.auto.compile.dir = true
relaxed.isolation = false
mapreduce.num.mappers = 4
accumulo.max.latency = 5000
import.direct.split.size = 0
sqlconnection.metadata.transaction.isolation.level = 2
codegen.output.delimiters.field = 1
export.new.update = UpdateOnly
incremental.mode = AppendRows
hdfs.file.format = TextFile
sqoop.oracle.escaping.disabled = true
codegen.compile.dir = /tmp/sqoop-donghua/compile/39496d079794ae53a008a2da9cd2ac4a
direct.import = false
temporary.dirRoot = _sqoop
db.split.column = id
hive.fail.table.exists = false
db.batch = false
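Note that after the run the stored incremental.last.value has advanced automatically from 4 to 6, so the next --exec will only pick up rows with id > 6.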
Use Sqoop to perform incremental data loading (--incremental=lastmodified for a timestamp column)
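The capture omits the creation of the source table t2 and its first import into /user/donghua/t2. A minimal sketch of what must have run beforehand (column names taken from the output below; the exact DDL and import options are assumptions):

-- MariaDB side
create table t2 (id int primary key, last_updated_at timestamp);
insert into t2 values(1,current_timestamp());
insert into t2 values(2,current_timestamp());

# initial load into HDFS
sqoop import --connect jdbc:mysql://cdh-vm.dbaglobe.com/employees \
  --username employee_user -P --table t2 -m 1 --target-dir=/user/donghua/t2

After that first import, the target directory contains two rows: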
1,2018-01-27 04:50:07.0
2,2018-01-27 04:50:18.0
[donghua@cdh-vm ~]$
MariaDB [employees]> insert into t2 values(3,current_timestamp());
Query OK, 1 row affected (0.01 sec)
MariaDB [employees]> update t2 set last_updated_at=current_timestamp() where id=2;
Query OK, 1 row affected (0.07 sec)
Rows matched: 1 Changed: 1 Warnings: 0
MariaDB [employees]> select * from t2;
+----+---------------------+
| id | last_updated_at |
+----+---------------------+
| 1 | 2018-01-27 04:50:07 |
| 2 | 2018-01-27 05:10:14 |
| 3 | 2018-01-27 05:09:45 |
+----+---------------------+
3 rows in set (0.00 sec)
MariaDB [employees]>
[donghua@cdh-vm ~]$ sqoop import --connect jdbc:mysql://cdh-vm.dbaglobe.com/employees --username employee_user --password password --table t2 -m 1 --target-dir=/user/donghua/t2 --check-column=last_updated_at --incremental=lastmodified --last-value='2018-01-27 05:06:03.0'
Warning: /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/bin/../lib/sqoop/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
18/01/27 05:11:59 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.13.1
18/01/27 05:11:59 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
18/01/27 05:12:00 INFO manager.MySQLManager: Preparing to use a MySQL streaming resultset.
18/01/27 05:12:00 INFO tool.CodeGenTool: Beginning code generation
18/01/27 05:12:00 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `t2` AS t LIMIT 1
18/01/27 05:12:00 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `t2` AS t LIMIT 1
18/01/27 05:12:00 INFO orm.CompilationManager: HADOOP_MAPRED_HOME is /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce
Note: /tmp/sqoop-donghua/compile/631be22fe0124698ede97beba0c8288e/t2.java uses or overrides a deprecated API.
Note: Recompile with -Xlint:deprecation for details.
18/01/27 05:12:01 INFO orm.CompilationManager: Writing jar file: /tmp/sqoop-donghua/compile/631be22fe0124698ede97beba0c8288e/t2.jar
18/01/27 05:12:02 ERROR tool.ImportTool: Import failed: --merge-key or --append is required when using --incremental lastmodified and the output directory exists.
[donghua@cdh-vm ~]$ sqoop import --connect jdbc:mysql://cdh-vm.dbaglobe.com/employees --username employee_user --password password --table t2 -m 1 --target-dir=/user/donghua/t2 --check-column=last_updated_at --incremental=lastmodified --last-value='2018-01-27 05:06:03.0' --merge-key=id
Warning: /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/bin/../lib/sqoop/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
18/01/27 05:12:40 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.13.1
18/01/27 05:12:40 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
18/01/27 05:12:40 INFO manager.MySQLManager: Preparing to use a MySQL streaming resultset.
18/01/27 05:12:40 INFO tool.CodeGenTool: Beginning code generation
18/01/27 05:12:41 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `t2` AS t LIMIT 1
18/01/27 05:12:41 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `t2` AS t LIMIT 1
18/01/27 05:12:41 INFO orm.CompilationManager: HADOOP_MAPRED_HOME is /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce
Note: /tmp/sqoop-donghua/compile/2e4f90897221b505b822c323c3cb2b41/t2.java uses or overrides a deprecated API.
Note: Recompile with -Xlint:deprecation for details.
18/01/27 05:12:42 INFO orm.CompilationManager: Writing jar file: /tmp/sqoop-donghua/compile/2e4f90897221b505b822c323c3cb2b41/t2.jar
18/01/27 05:12:43 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `t2` AS t LIMIT 1
18/01/27 05:12:43 INFO tool.ImportTool: Incremental import based on column `last_updated_at`
18/01/27 05:12:43 INFO tool.ImportTool: Lower bound value: '2018-01-27 05:06:03.0'
18/01/27 05:12:43 INFO tool.ImportTool: Upper bound value: '2018-01-27 05:12:43.0'
18/01/27 05:12:43 WARN manager.MySQLManager: It looks like you are importing from mysql.
18/01/27 05:12:43 WARN manager.MySQLManager: This transfer can be faster! Use the --direct
18/01/27 05:12:43 WARN manager.MySQLManager: option to exercise a MySQL-specific fast path.
18/01/27 05:12:43 INFO manager.MySQLManager: Setting zero DATETIME behavior to convertToNull (mysql)
18/01/27 05:12:43 INFO mapreduce.ImportJobBase: Beginning import of t2
18/01/27 05:12:43 INFO Configuration.deprecation: mapred.jar is deprecated. Instead, use mapreduce.job.jar
18/01/27 05:12:43 INFO Configuration.deprecation: mapred.map.tasks is deprecated. Instead, use mapreduce.job.maps
18/01/27 05:12:43 INFO client.RMProxy: Connecting to ResourceManager at cdh-vm.dbaglobe.com/192.168.56.10:8032
18/01/27 05:12:47 INFO db.DBInputFormat: Using read commited transaction isolation
18/01/27 05:12:47 INFO mapreduce.JobSubmitter: number of splits:1
18/01/27 05:12:48 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1517023991003_0016
18/01/27 05:12:48 INFO impl.YarnClientImpl: Submitted application application_1517023991003_0016
18/01/27 05:12:48 INFO mapreduce.Job: The url to track the job: http://cdh-vm.dbaglobe.com:8088/proxy/application_1517023991003_0016/
18/01/27 05:12:48 INFO mapreduce.Job: Running job: job_1517023991003_0016
18/01/27 05:12:55 INFO mapreduce.Job: Job job_1517023991003_0016 running in uber mode : false
18/01/27 05:12:55 INFO mapreduce.Job: map 0% reduce 0%
18/01/27 05:13:01 INFO mapreduce.Job: map 100% reduce 0%
18/01/27 05:13:02 INFO mapreduce.Job: Job job_1517023991003_0016 completed successfully
18/01/27 05:13:02 INFO mapreduce.Job: Counters: 30
File System Counters
FILE: Number of bytes read=0
FILE: Number of bytes written=175177
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=87
HDFS: Number of bytes written=48
HDFS: Number of read operations=4
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
Job Counters
Launched map tasks=1
Other local map tasks=1
Total time spent by all maps in occupied slots (ms)=4073
Total time spent by all reduces in occupied slots (ms)=0
Total time spent by all map tasks (ms)=4073
Total vcore-milliseconds taken by all map tasks=4073
Total megabyte-milliseconds taken by all map tasks=6256128
Map-Reduce Framework
Map input records=2
Map output records=2
Input split bytes=87
Spilled Records=0
Failed Shuffles=0
Merged Map outputs=0
GC time elapsed (ms)=61
CPU time spent (ms)=920
Physical memory (bytes) snapshot=196935680
Virtual memory (bytes) snapshot=2785828864
Total committed heap usage (bytes)=155713536
File Input Format Counters
Bytes Read=0
File Output Format Counters
Bytes Written=48
18/01/27 05:13:02 INFO mapreduce.ImportJobBase: Transferred 48 bytes in 19.1241 seconds (2.5099 bytes/sec)
18/01/27 05:13:02 INFO mapreduce.ImportJobBase: Retrieved 2 records.
18/01/27 05:13:02 INFO tool.ImportTool: Final destination exists, will run merge job.
18/01/27 05:13:02 INFO Configuration.deprecation: mapred.output.key.class is deprecated. Instead, use mapreduce.job.output.key.class
18/01/27 05:13:02 INFO client.RMProxy: Connecting to ResourceManager at cdh-vm.dbaglobe.com/192.168.56.10:8032
18/01/27 05:13:07 INFO input.FileInputFormat: Total input paths to process : 2
18/01/27 05:13:07 INFO mapreduce.JobSubmitter: number of splits:2
18/01/27 05:13:07 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1517023991003_0017
18/01/27 05:13:08 INFO impl.YarnClientImpl: Submitted application application_1517023991003_0017
18/01/27 05:13:08 INFO mapreduce.Job: The url to track the job: http://cdh-vm.dbaglobe.com:8088/proxy/application_1517023991003_0017/
18/01/27 05:13:08 INFO mapreduce.Job: Running job: job_1517023991003_0017
18/01/27 05:13:15 INFO mapreduce.Job: Job job_1517023991003_0017 running in uber mode : false
18/01/27 05:13:15 INFO mapreduce.Job: map 0% reduce 0%
18/01/27 05:13:20 INFO mapreduce.Job: map 50% reduce 0%
18/01/27 05:13:24 INFO mapreduce.Job: map 100% reduce 0%
18/01/27 05:13:31 INFO mapreduce.Job: map 100% reduce 100%
18/01/27 05:13:31 INFO mapreduce.Job: Job job_1517023991003_0017 completed successfully
18/01/27 05:13:31 INFO mapreduce.Job: Counters: 49
File System Counters
FILE: Number of bytes read=90
FILE: Number of bytes written=526653
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=386
HDFS: Number of bytes written=72
HDFS: Number of read operations=9
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
Job Counters
Launched map tasks=2
Launched reduce tasks=1
Data-local map tasks=2
Total time spent by all maps in occupied slots (ms)=6496
Total time spent by all reduces in occupied slots (ms)=3317
Total time spent by all map tasks (ms)=6496
Total time spent by all reduce tasks (ms)=3317
Total vcore-milliseconds taken by all map tasks=6496
Total vcore-milliseconds taken by all reduce tasks=3317
Total megabyte-milliseconds taken by all map tasks=9977856
Total megabyte-milliseconds taken by all reduce tasks=5094912
Map-Reduce Framework
Map input records=4
Map output records=4
Map output bytes=96
Map output materialized bytes=122
Input split bytes=290
Combine input records=0
Combine output records=0
Reduce input groups=3
Reduce shuffle bytes=122
Reduce input records=4
Reduce output records=3
Spilled Records=8
Shuffled Maps =2
Failed Shuffles=0
Merged Map outputs=2
GC time elapsed (ms)=211
CPU time spent (ms)=1900
Physical memory (bytes) snapshot=1147371520
Virtual memory (bytes) snapshot=8375828480
Total committed heap usage (bytes)=1154482176
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=96
File Output Format Counters
Bytes Written=72
18/01/27 05:13:31 INFO tool.ImportTool: Incremental import complete! To run another incremental import of all data following this import, supply the following arguments:
18/01/27 05:13:31 INFO tool.ImportTool: --incremental lastmodified
18/01/27 05:13:31 INFO tool.ImportTool: --check-column last_updated_at
18/01/27 05:13:31 INFO tool.ImportTool: --last-value 2018-01-27 05:12:43.0
18/01/27 05:13:31 INFO tool.ImportTool: (Consider saving this with 'sqoop job --create')
[donghua@cdh-vm ~]$ hdfs dfs -ls /user/donghua/t2/
Found 2 items
-rw-r--r-- 1 donghua supergroup 0 2018-01-27 05:13 /user/donghua/t2/_SUCCESS
-rw-r--r-- 1 donghua supergroup 72 2018-01-27 05:13 /user/donghua/t2/part-r-00000
[donghua@cdh-vm ~]$ hdfs dfs -cat /user/donghua/t2/part-r-00000
1,2018-01-27 04:50:07.0
2,2018-01-27 05:10:14.0
3,2018-01-27 05:09:45.0
[donghua@cdh-vm ~]$
Use Sqoop export to move data from HDFS into MySQL
MariaDB [employees]> create table current_dept_emp2 as select * from current_dept_emp where 1=2;
Query OK, 0 rows affected (0.05 sec)
Records: 0 Duplicates: 0 Warnings: 0
[donghua@cdh-vm ~]$ sqoop export --connect jdbc:mysql://cdh-vm.dbaglobe.com/employees --username employee_user --password password --table current_dept_emp2 --export-dir /user/donghua/current_dept_emp
Warning: /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/bin/../lib/sqoop/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
18/01/27 05:43:54 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.13.1
18/01/27 05:43:54 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
18/01/27 05:43:55 INFO manager.MySQLManager: Preparing to use a MySQL streaming resultset.
18/01/27 05:43:55 INFO tool.CodeGenTool: Beginning code generation
18/01/27 05:43:55 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `current_dept_emp2` AS t LIMIT 1
18/01/27 05:43:55 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `current_dept_emp2` AS t LIMIT 1
18/01/27 05:43:55 INFO orm.CompilationManager: HADOOP_MAPRED_HOME is /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce
Note: /tmp/sqoop-donghua/compile/4eb832477301808137f8d255765ba2ca/current_dept_emp2.java uses or overrides a deprecated API.
Note: Recompile with -Xlint:deprecation for details.
18/01/27 05:43:56 INFO orm.CompilationManager: Writing jar file: /tmp/sqoop-donghua/compile/4eb832477301808137f8d255765ba2ca/current_dept_emp2.jar
18/01/27 05:43:56 INFO mapreduce.ExportJobBase: Beginning export of current_dept_emp2
18/01/27 05:43:57 INFO Configuration.deprecation: mapred.jar is deprecated. Instead, use mapreduce.job.jar
18/01/27 05:43:58 INFO Configuration.deprecation: mapred.reduce.tasks.speculative.execution is deprecated. Instead, use mapreduce.reduce.speculative
18/01/27 05:43:58 INFO Configuration.deprecation: mapred.map.tasks.speculative.execution is deprecated. Instead, use mapreduce.map.speculative
18/01/27 05:43:58 INFO Configuration.deprecation: mapred.map.tasks is deprecated. Instead, use mapreduce.job.maps
18/01/27 05:43:58 INFO client.RMProxy: Connecting to ResourceManager at cdh-vm.dbaglobe.com/192.168.56.10:8032
18/01/27 05:44:00 INFO input.FileInputFormat: Total input paths to process : 1
18/01/27 05:44:00 INFO input.FileInputFormat: Total input paths to process : 1
18/01/27 05:44:00 INFO mapreduce.JobSubmitter: number of splits:4
18/01/27 05:44:01 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1517023991003_0018
18/01/27 05:44:01 INFO impl.YarnClientImpl: Submitted application application_1517023991003_0018
18/01/27 05:44:01 INFO mapreduce.Job: The url to track the job: http://cdh-vm.dbaglobe.com:8088/proxy/application_1517023991003_0018/
18/01/27 05:44:01 INFO mapreduce.Job: Running job: job_1517023991003_0018
18/01/27 05:44:08 INFO mapreduce.Job: Job job_1517023991003_0018 running in uber mode : false
18/01/27 05:44:08 INFO mapreduce.Job: map 0% reduce 0%
18/01/27 05:44:16 INFO mapreduce.Job: map 25% reduce 0%
18/01/27 05:44:22 INFO mapreduce.Job: map 50% reduce 0%
18/01/27 05:44:28 INFO mapreduce.Job: map 75% reduce 0%
18/01/27 05:44:34 INFO mapreduce.Job: map 100% reduce 0%
18/01/27 05:44:35 INFO mapreduce.Job: Job job_1517023991003_0018 completed successfully
18/01/27 05:44:35 INFO mapreduce.Job: Counters: 30
File System Counters
FILE: Number of bytes read=0
FILE: Number of bytes written=695328
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=10241715
HDFS: Number of bytes written=0
HDFS: Number of read operations=19
HDFS: Number of large read operations=0
HDFS: Number of write operations=0
Job Counters
Launched map tasks=4
Data-local map tasks=4
Total time spent by all maps in occupied slots (ms)=20479
Total time spent by all reduces in occupied slots (ms)=0
Total time spent by all map tasks (ms)=20479
Total vcore-milliseconds taken by all map tasks=20479
Total megabyte-milliseconds taken by all map tasks=31455744
Map-Reduce Framework
Map input records=300024
Map output records=300024
Input split bytes=711
Spilled Records=0
Failed Shuffles=0
Merged Map outputs=0
GC time elapsed (ms)=332
CPU time spent (ms)=15020
Physical memory (bytes) snapshot=1057984512
Virtual memory (bytes) snapshot=11192446976
Total committed heap usage (bytes)=862453760
File Input Format Counters
Bytes Read=0
File Output Format Counters
Bytes Written=0
18/01/27 05:44:35 INFO mapreduce.ExportJobBase: Transferred 9.7673 MB in 37.4601 seconds (266.9952 KB/sec)
18/01/27 05:44:35 INFO mapreduce.ExportJobBase: Exported 300024 records.
MariaDB [employees]> select count(*) from current_dept_emp2;
+----------+
| count(*) |
+----------+
| 300024 |
+----------+
1 row in set (0.09 sec)
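Re-running the same export against a table that already contains the rows would insert duplicates. A hedged variant using Sqoop's update/upsert mode is sketched below; the key columns emp_no,dept_no are an assumption based on the current_dept_emp layout, and with the MySQL connector allowinsert typically translates to INSERT ... ON DUPLICATE KEY UPDATE, so the target table needs a matching primary or unique key (the CTAS-created current_dept_emp2 above has none).
# upsert instead of plain insert: update rows matching the key, insert the rest
sqoop export --connect jdbc:mysql://cdh-vm.dbaglobe.com/employees \
  --username employee_user -P \
  --table current_dept_emp2 \
  --export-dir /user/donghua/current_dept_emp \
  --update-key emp_no,dept_no --update-mode allowinsert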
Convert Excel into CSV using pandas
warning: setlocale: LC_CTYPE: cannot change locale (UTF-8)
Last login: Fri Feb 2 07:19:47 2018 from 192.168.31.177
-bash: warning: setlocale: LC_CTYPE: cannot change locale (UTF-8): No such file or directory
[donghua@localhost ~]$ vi /etc/environment
# Add the following 2 lines
LANG=en_US.utf-8
LC_ALL=en_US.utf-8
Donghuas-MacBook-Air:~ donghua$ ssh 192.168.31.5
Last login: Fri Feb 2 07:25:15 2018 from 192.168.31.177
[donghua@localhost ~]$
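With the locale fixed, the conversion this section is titled after is only a few lines of pandas. A minimal sketch, assuming placeholder file names input.xlsx/output.csv and that an Excel engine such as xlrd or openpyxl is installed:
# convert the first worksheet of an Excel file to CSV
import pandas as pd

df = pd.read_excel('input.xlsx')                         # first sheet by default; pass sheet_name=... for others
df.to_csv('output.csv', index=False, encoding='utf-8')   # drop the pandas index, keep UTF-8 data intact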
Use Python script to manage Cloudera CDH services
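No script is reproduced here, but as a hedged sketch of the idea using the cm_api client (pip install cm-api; the host name, admin/admin credentials and service name 'hive' are placeholders to adapt):
# list clusters/services and optionally restart one via the Cloudera Manager Python API
from cm_api.api_client import ApiResource

api = ApiResource('cdh-vm.dbaglobe.com', username='admin', password='admin')
for cluster in api.get_all_clusters():
    for service in cluster.get_all_services():
        print("%s %s %s %s" % (cluster.name, service.name, service.serviceState, service.healthSummary))
    # cluster.get_service('hive').restart().wait()   # restart one service; returns an ApiCommand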
Change timezone on Redhat EL7
Local time: Sat 2018-02-03 18:51:25 EST
Universal time: Sat 2018-02-03 23:51:25 UTC
RTC time: Sat 2018-02-03 23:51:22
Time zone: America/New_York (EST, -0500)
NTP enabled: yes
NTP synchronized: yes
RTC in local TZ: no
DST active: no
Last DST change: DST ended at
Sun 2017-11-05 01:59:59 EDT
Sun 2017-11-05 01:00:00 EST
Next DST change: DST begins (the clock jumps one hour forward) at
Sun 2018-03-11 01:59:59 EST
Sun 2018-03-11 03:00:00 EDT
[root@cdh-vm ~]# timedatectl list-timezones |grep -i singapore
Asia/Singapore
[root@cdh-vm ~]# timedatectl set-timezone Asia/Singapore
[root@cdh-vm ~]# timedatectl
Local time: Sun 2018-02-04 07:52:04 +08
Universal time: Sat 2018-02-03 23:52:04 UTC
RTC time: Sat 2018-02-03 23:52:01
Time zone: Asia/Singapore (+08, +0800)
NTP enabled: yes
NTP synchronized: yes
RTC in local TZ: no
DST active: n/a
Fix "you are accessing a non-optimized Hue" message with Proxy/LB setup
You are accessing a non-optimized Hue, please switch to one of the available addresses: http://cdh-vm.dbaglobe.com:8889
How to fix:
Hue -> Configuration -> Hue Service Advanced Configuration Snippet (Safety Valve) for hue_safety_valve.ini
[desktop]
hue_load_balancer=http://192.168.31.5:8889,http://cdh-vm.dbaglobe.com:8889
warning: "set mapreduce.framework.name=local" terminates hiveserver2 server process
No rows affected (0.015 seconds)
0: jdbc:hive2://cdh-vm:10000/employees> select id,count(*) num from t1 group by id order by num;
Unknown HS2 problem when communicating with Thrift server.
Error: org.apache.thrift.transport.TTransportException: java.net.SocketException: Broken pipe (Write failed) (state=08S01,code=0)
# /var/run/cloudera-scm-agent/process/175-hive-HIVESERVER2/logs/stderr.log
Job running in-process (local Hadoop)
+ ps -p 898 -c
+ grep java
+ RET=0
+ '[' 0 -eq 0 ']'
+ TARGET=898
++ date
+ echo Tue Feb 6 22:19:34 +08 2018
+ kill -9 898
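The stderr trace above shows the wrapper script under the HiveServer2 process directory issuing kill -9 against pid 898 once the query ran in-process ("local Hadoop"), which is what takes the server down. A less disruptive alternative for small queries is Hive's auto local mode, sketched below; hive.exec.mode.local.auto is a standard Hive setting, but confirm it is permitted by your HiveServer2 configuration before relying on it.
-- in beeline: let Hive decide per query whether local execution is safe
set hive.exec.mode.local.auto=true;
select id,count(*) num from t1 group by id order by num;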
SQL Server JDBC integrated authentication error
com.microsoft.sqlserver.jdbc.SQLServerException: This driver is not configured for integrated authentication. ClientConnectionId:b31236b3-c830-45c9-bdb0-8e9ecbe01476
at com.microsoft.sqlserver.jdbc.SQLServerConnection.terminate(SQLServerConnection.java:2400)
at com.microsoft.sqlserver.jdbc.AuthenticationJNI.<init>
at com.microsoft.sqlserver.jdbc.SQLServerConnection.logon(SQLServerConnection.java:3132)
at com.microsoft.sqlserver.jdbc.SQLServerConnection.access$100(SQLServerConnection.java:43)
at com.microsoft.sqlserver.jdbc.SQLServerConnection$LogonCommand.doExecute(SQLServerConnection.java:3123)
at com.microsoft.sqlserver.jdbc.TDSCommand.execute(IOBuffer.java:7505)
at com.microsoft.sqlserver.jdbc.SQLServerConnection.executeCommand(SQLServerConnection.java:2445)
at com.microsoft.sqlserver.jdbc.SQLServerConnection.connectHelper(SQLServerConnection.java:1981)
at com.microsoft.sqlserver.jdbc.SQLServerConnection.login(SQLServerConnection.java:1628)
at com.microsoft.sqlserver.jdbc.SQLServerConnection.connectInternal(SQLServerConnection.java:1459)
at com.microsoft.sqlserver.jdbc.SQLServerConnection.connect(SQLServerConnection.java:773)
at com.microsoft.sqlserver.jdbc.SQLServerDriver.connect(SQLServerDriver.java:1168)
at org.talend.core.model.metadata.builder.database.DriverShim.connect(DriverShim.java:41)
at org.talend.core.model.metadata.builder.database.ExtractMetaDataUtils.connect(ExtractMetaDataUtils.java:1069)
at org.talend.core.model.metadata.builder.database.ExtractMetaDataFromDataBase.testConnection(ExtractMetaDataFromDataBase.java:315)
at org.talend.metadata.managment.repository.ManagerConnection.check(ManagerConnection.java:289)
at org.talend.repository.ui.wizards.metadata.connection.database.DatabaseForm$62.runWithCancel(DatabaseForm.java:3983)
at org.talend.repository.ui.wizards.metadata.connection.database.DatabaseForm$62.runWithCancel(DatabaseForm.java:1)
at org.talend.repository.ui.dialog.AProgressMonitorDialogWithCancel$1.runnableWithCancel(AProgressMonitorDialogWithCancel.java:77)
at org.talend.repository.ui.dialog.AProgressMonitorDialogWithCancel$ARunnableWithProgressCancel$1.call(AProgressMonitorDialogWithCancel.java:161)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Caused by: java.lang.UnsatisfiedLinkError: no sqljdbc_auth in java.library.path
at java.lang.ClassLoader.loadLibrary(Unknown Source)
at java.lang.Runtime.loadLibrary0(Unknown Source)
at java.lang.System.loadLibrary(Unknown Source)
at com.microsoft.sqlserver.jdbc.AuthenticationJNI.<clinit>
... 20 more
How to fix:
Copy the sqljdbc_auth.dll that matches the OS architecture (x86 or x64) into C:\Windows or C:\Windows\System32
PS C:\Users\Administrator> copy C:\Donghua\TOD-DI\sqljdbc_6.0\enu\auth\x64\sqljdbc_auth.dll C:\Windows\
PS C:\Users\Administrator> dir C:\Windows\sqljdbc_auth.dll
Directory: C:\Windows
Mode LastWriteTime Length Name
---- ------------- ------ ----
-a---- 1/17/2017 11:44 AM 310088 sqljdbc_auth.dll
How to upgrade outdated Python packages
pycurl: libcurl link-time ssl backend (nss) is different from compile-time ssl backend (none/other)
There was a problem importing one of the Python modules
required to run yum. The error leading to this problem was:
pycurl: libcurl link-time ssl backend (nss) is different from compile-time ssl backend (none/other)
Please install a package which provides this module, or
verify that the module is installed correctly.
It's possible that the above module doesn't match the
current version of Python, which is:
2.7.5 (default, Aug 4 2017, 00:39:18)
[GCC 4.8.5 20150623 (Red Hat 4.8.5-16)]
If you cannot solve this problem yourself, please go to
the yum faq at:
http://yum.baseurl.org/wiki/Faq
How to fix:
Remove the existing pycurl installation:
[root@cdh-vm logs]# pip uninstall pycurl
Export the link-time SSL backend reported in the message (nss above), then reinstall:
[root@cdh-vm logs]# export PYCURL_SSL_LIBRARY=nss
[root@cdh-vm logs]# pip install pycurl
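pip may reuse a previously built wheel and come back with the same backend; a quick hedged check (pycurl.version reports the libcurl and SSL backend the module was linked against):
# force a fresh build, then confirm the linked SSL backend
pip install --no-cache-dir pycurl
python -c "import pycurl; print(pycurl.version)"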
Learning Apache Pig Chapter 2 (O'Reilly)
https://resources.oreilly.com/examples/0636920047704/blob/master/Learning%20Apache%20Pig%20-%20Working%20Files/Chapter%202/cities_small.txt
https://resources.oreilly.com/examples/0636920047704/blob/master/Learning%20Apache%20Pig%20-%20Working%20Files/Chapter%202/states.txt
[donghua@cdh-vm temp]$ pig -4 log4j.properties
grunt> cities = load 'cities_small.txt' as (name:chararray,state:chararray,pop:int);
grunt> aliases;
grunt> describe cities
cities: {name: chararray,state: chararray,pop: int}
grunt> \de cities
cities: {name: chararray,state: chararray,pop: int}
grunt> ca_cities = filter cities by (state=='CA');
grunt> dump ca_cities;
grunt> \d ca_cities
grunt> illustrate;
(South Gate,CA,96640)
--------------------------------------------------------------------
| cities | name:chararray | state:chararray | pop:int |
--------------------------------------------------------------------
| | South Gate | CA | 96640 |
--------------------------------------------------------------------
grunt> illustrate;
(Fresno,CA,476050)
--------------------------------------------------------------------
| cities | name:chararray | state:chararray | pop:int |
--------------------------------------------------------------------
| | Fresno | CA | 476050 |
--------------------------------------------------------------------
grunt> ordered_cities = order cities by pop desc;
grunt> states = load 'states.txt' as (rank:int,code:chararray,fullname:chararray,date_entered:chararray,year_entered:int);
grunt> cities_join_states = join cities by state, states by code;
grunt> illustrate cities_join_states;
(Fargo,ND,93531)
(39,ND,North Dakota,02-NOV,1889)
--------------------------------------------------------------------
| cities | name:chararray | state:chararray | pop:int |
--------------------------------------------------------------------
| | Fargo | ND | 93531 |
| | Fargo | ND | 93531 |
--------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------
| states | rank:int | code:chararray | fullname:chararray | date_entered:chararray | year_entered:int |
--------------------------------------------------------------------------------------------------------------------------
| | 39 | ND | North Dakota | 02-NOV | 1889 |
| | 39 | ND | North Dakota | 02-NOV | 1889 |
--------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
| cities_join_states | cities::name:chararray | cities::state:chararray | cities::pop:int | states::rank:int | states::code:chararray | states::fullname:chararray | states::date_entered:chararray | states::year_entered:int |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
| | Fargo | ND | 93531 | 39 | ND | North Dakota | 02-NOV | 1889 |
| | Fargo | ND | 93531 | 39 | ND | North Dakota | 02-NOV | 1889 |
| | Fargo | ND | 93531 | 39 | ND | North Dakota | 02-NOV | 1889 |
| | Fargo | ND | 93531 | 39 | ND | North Dakota | 02-NOV | 1889 |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
grunt> cities_join_states_short = foreach cities_join_states generate cities::name, states::fullname;
grunt> store cities_join_states_short into 'cities_join_states_short';
grunt> fs -ls cities_join_states_short
grunt> fs -cat cities_join_states_short/part-r-00000
grunt> cities_join_states_short = foreach (join cities by state, states by code) generate cities::name, states::fullname;
grunt> city_and_state = foreach cities generate name,state,pop*1.5;
grunt> cities_by_state = group cities by state;
grunt> \de cities_by_state;
cities_by_state: {group: chararray,cities: {(name: chararray,state: chararray,pop: int)}}
grunt> illustrate cities_by_state;
(Sioux Falls,SD,154997)
-----------------------------------------------------------------------
| cities | name:chararray | state:chararray | pop:int |
-----------------------------------------------------------------------
| | Sioux Falls | SD | 154997 |
| | Rapid City | SD | 65491 |
-----------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------
| cities_by_state | group:chararray | cities:bag{:tuple(name:chararray,state:chararray,pop:int)} |
------------------------------------------------------------------------------------------------------------------------------
| | SD | {(Sioux Falls, SD, 154997), (Rapid City, SD, 65491)} |
------------------------------------------------------------------------------------------------------------------------------
grunt> total_cities = foreach (group cities all) generate COUNT(cities);
grunt> \d total_cities;
(500)
grunt> cities_by_state = foreach (group cities by state) generate group, COUNT(cities);
grunt> \d cities_by_state;
grunt> cities_by_state = foreach (group cities by state parallel 3) generate group, COUNT(cities);
grunt> store cities_by_state into 'cities_by_state';
grunt> fs -ls cities_by_state
Found 4 items
-rw-r--r-- 1 donghua supergroup 0 2018-02-17 22:25 cities_by_state/_SUCCESS
-rw-r--r-- 1 donghua supergroup 113 2018-02-17 22:25 cities_by_state/part-r-00000
-rw-r--r-- 1 donghua supergroup 82 2018-02-17 22:25 cities_by_state/part-r-00001
-rw-r--r-- 1 donghua supergroup 86 2018-02-17 22:25 cities_by_state/part-r-00002
Complex data processing in SQL vs Pig
[donghua@cdh-vm temp]$ hcat -e "desc employees.departments"
dept_no string
dept_name string
[donghua@cdh-vm temp]$ hcat -e "desc employees.dept_manager"
emp_no int
dept_no string
from_date string
to_date string
[donghua@cdh-vm temp]$ hcat -e "desc employees.dept_emp"
emp_no int
dept_no string
from_date string
to_date string
[donghua@cdh-vm temp]$ hcat -e "desc employees.employees"
emp_no int
birth_date string
first_name string
last_name string
gender string
hire_date string
-- Find out their manager name & department size
select d.dept_name, concat(m.first_name,' ',m.last_name) as manager, count(e.emp_no) employees
from employees.departments d
join employees.dept_manager dm on d.dept_no = dm.dept_no
join employees.employees m on dm.emp_no = m.emp_no
join employees.dept_emp de on d.dept_no = de.dept_no
join employees.employees e on de.emp_no = e.emp_no
where de.to_date >'2018-01-01'
and dm.to_date > '2018-01-01'
group by d.dept_name, concat(m.first_name,' ',m.last_name)
order by d.dept_name;
Run SQL in Hive:
Connecting to jdbc:hive2://cdh-vm.dbaglobe.com:10000/employees
Connected to: Apache Hive (version 1.1.0-cdh5.14.0)
Driver: Hive JDBC (version 1.1.0-cdh5.14.0)
Transaction isolation: TRANSACTION_REPEATABLE_READ
Beeline version 1.1.0-cdh5.14.0 by Apache Hive
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/emp> -- Find out their manager name & department size
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/emp> select d.dept_name, concat(m.first_name,' ',m.last_name) as manager, count(e.emp_no) employees
. . . . . . . . . . . . . . . . . . . . . . .> from employees.departments d
. . . . . . . . . . . . . . . . . . . . . . .> join employees.dept_manager dm on d.dept_no = dm.dept_no
. . . . . . . . . . . . . . . . . . . . . . .> join employees.employees m on dm.emp_no = m.emp_no
. . . . . . . . . . . . . . . . . . . . . . .> join employees.dept_emp de on d.dept_no = de.dept_no
. . . . . . . . . . . . . . . . . . . . . . .> join employees.employees e on de.emp_no = e.emp_no
. . . . . . . . . . . . . . . . . . . . . . .> where de.to_date >'2018-01-01'
. . . . . . . . . . . . . . . . . . . . . . .> and dm.to_date > '2018-01-01'
. . . . . . . . . . . . . . . . . . . . . . .> group by d.dept_name, concat(m.first_name,' ',m.last_name)
. . . . . . . . . . . . . . . . . . . . . . .> order by d.dept_name;
+---------------------+--------------------+------------+--+
| d.dept_name | manager | employees |
+---------------------+--------------------+------------+--+
| Customer Service | Yuchang Weedman | 17569 |
| Development | Leon DasSarma | 61386 |
| Finance | Isamu Legleitner | 12437 |
| Human Resources | Karsten Sigstam | 12898 |
| Marketing | Vishwani Minakawa | 14842 |
| Production | Oscar Ghazalie | 53304 |
| Quality Management | Dung Pesch | 14546 |
| Research | Hilary Kambil | 15441 |
| Sales | Hauke Zhang | 37701 |
+---------------------+--------------------+------------+--+
9 rows selected (100.528 seconds)
Run SQL in Impala:
Connected to cdh-vm.dbaglobe.com:21000
Server version: impalad version 2.11.0-cdh5.14.0 RELEASE (build d68206561bce6b26762d62c01a78e6cd27aa7690)
***********************************************************************************
Welcome to the Impala shell.
(Impala Shell v2.11.0-cdh5.14.0 (d682065) built on Sat Jan 6 13:27:16 PST 2018)
Press TAB twice to see a list of available commands.
***********************************************************************************
[cdh-vm.dbaglobe.com:21000] > -- Find out their manager name & department size
> select d.dept_name, concat(m.first_name,' ',m.last_name) as manager, count(e.emp_no) employees
> from employees.departments d
> join employees.dept_manager dm on d.dept_no = dm.dept_no
> join employees.employees m on dm.emp_no = m.emp_no
> join employees.dept_emp de on d.dept_no = de.dept_no
> join employees.employees e on de.emp_no = e.emp_no
> where de.to_date >'2018-01-01'
> and dm.to_date > '2018-01-01'
> group by d.dept_name, concat(m.first_name,' ',m.last_name)
> order by d.dept_name;
Query: -- Find out their manager name & department size
select d.dept_name, concat(m.first_name,' ',m.last_name) as manager, count(e.emp_no) employees
from employees.departments d
join employees.dept_manager dm on d.dept_no = dm.dept_no
join employees.employees m on dm.emp_no = m.emp_no
join employees.dept_emp de on d.dept_no = de.dept_no
join employees.employees e on de.emp_no = e.emp_no
where de.to_date >'2018-01-01'
and dm.to_date > '2018-01-01'
group by d.dept_name, concat(m.first_name,' ',m.last_name)
order by d.dept_name
Query submitted at: 2018-02-18 20:58:51 (Coordinator: http://cdh-vm.dbaglobe.com:25000)
Query progress can be monitored at: http://cdh-vm.dbaglobe.com:25000/query_plan?query_id=a04e8317637c0e4a:a83017f00000000
+--------------------+-------------------+-----------+
| dept_name | manager | employees |
+--------------------+-------------------+-----------+
| Customer Service | Yuchang Weedman | 17569 |
| Development | Leon DasSarma | 61386 |
| Finance | Isamu Legleitner | 12437 |
| Human Resources | Karsten Sigstam | 12898 |
| Marketing | Vishwani Minakawa | 14842 |
| Production | Oscar Ghazalie | 53304 |
| Quality Management | Dung Pesch | 14546 |
| Research | Hilary Kambil | 15441 |
| Sales | Hauke Zhang | 37701 |
+--------------------+-------------------+-----------+
Fetched 9 row(s) in 19.43s
Run SQL in MySQL:
Server version: 5.5.56-MariaDB MariaDB Server
Copyright (c) 2000, 2017, Oracle, MariaDB Corporation Ab and others.
Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.
MariaDB [employees]> -- Find out their manager name & department size
MariaDB [employees]> select d.dept_name, concat(m.first_name,' ',m.last_name) as manager, count(e.emp_no) employees
-> from employees.departments d
-> join employees.dept_manager dm on d.dept_no = dm.dept_no
-> join employees.employees m on dm.emp_no = m.emp_no
-> join employees.dept_emp de on d.dept_no = de.dept_no
-> join employees.employees e on de.emp_no = e.emp_no
-> where de.to_date >'2018-01-01'
-> and dm.to_date > '2018-01-01'
-> group by d.dept_name, concat(m.first_name,' ',m.last_name)
-> order by d.dept_name;
+--------------------+-------------------+-----------+
| dept_name | manager | employees |
+--------------------+-------------------+-----------+
| Customer Service | Yuchang Weedman | 17569 |
| Development | Leon DasSarma | 61386 |
| Finance | Isamu Legleitner | 12437 |
| Human Resources | Karsten Sigstam | 12898 |
| Marketing | Vishwani Minakawa | 14842 |
| Production | Oscar Ghazalie | 53304 |
| Quality Management | Dung Pesch | 14546 |
| Research | Hilary Kambil | 15441 |
| Sales | Hauke Zhang | 37701 |
+--------------------+-------------------+-----------+
9 rows in set (1.62 sec)
Rewrite the SQL in Pig Latin:
-- pig script
-- Find out department size and their manager name
d0 = LOAD 'employees.departments' USING org.apache.hive.hcatalog.pig.HCatLoader();
dm0 = LOAD 'employees.dept_manager' USING org.apache.hive.hcatalog.pig.HCatLoader();
de0 = LOAD 'employees.dept_emp' USING org.apache.hive.hcatalog.pig.HCatLoader();
e0 = LOAD 'employees.employees' USING org.apache.hive.hcatalog.pig.HCatLoader();
d1 = FOREACH d0 GENERATE dept_no, dept_name;
dm1 = FOREACH (FILTER dm0 BY to_date >'2018-01-01') GENERATE dept_no, emp_no;
de1 = FOREACH (FILTER de0 by to_date >'2018-01-01') GENERATE dept_no, emp_no;
e1 = FOREACH e0 GENERATE emp_no, CONCAT(first_name,' ',last_name) AS fullname;
d1_dm1 = JOIN d1 BY dept_no, dm1 BY dept_no;
d1_dm1_e1 = JOIN d1_dm1 BY emp_no, e1 BY emp_no;
dept_mgr = FOREACH d1_dm1_e1 GENERATE d1_dm1::d1::dept_no AS dept_no, d1_dm1::d1::dept_name AS dept_name, e1::fullname AS manager;
d1_de1 = JOIN d1 BY dept_no, de1 BY dept_no;
d1_de1_e1 = FOREACH (JOIN d1_de1 BY emp_no, e1 BY emp_no) GENERATE d1_de1::d1::dept_no,e1::emp_no;
dept_emp_count = FOREACH (GROUP d1_de1_e1 BY dept_no) GENERATE group AS dept_no, COUNT(d1_de1_e1) AS employees;
dept_info_0 = JOIN dept_mgr BY dept_no, dept_emp_count BY dept_no;
dept_info_1 = FOREACH dept_info_0 GENERATE dept_mgr::dept_name AS dept_name, dept_mgr::manager AS manager, dept_emp_count::employees AS employees;
dept_info_2 = ORDER dept_info_0 BY dept_name;
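-- note: this orders the un-projected join (dept_info_0), which is why the duplicate
-- dept_no columns appear in the DUMP output below; ORDER dept_info_1 instead to keep
-- only dept_name, manager and employees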
DUMP dept_info_2;
[donghua@cdh-vm temp]$ date;pig -4 log4j.properties emp.pig;date;
Sun Feb 18 22:09:44 +08 2018
(d009,Customer Service,Yuchang Weedman,d009,17569)
(d005,Development,Leon DasSarma,d005,61386)
(d002,Finance,Isamu Legleitner,d002,12437)
(d003,Human Resources,Karsten Sigstam,d003,12898)
(d001,Marketing,Vishwani Minakawa,d001,14842)
(d004,Production,Oscar Ghazalie,d004,53304)
(d006,Quality Management,Dung Pesch,d006,14546)
(d008,Research,Hilary Kambil,d008,15441)
(d007,Sales,Hauke Zhang,d007,37701)
Sun Feb 18 22:16:08 +08 2018
Apache Reverse Proxy Example for Cloudera Yarn
# Yarn Resource Manager
Listen 192.168.31.14:8088
ProxyPreserveHost On
ProxyPass / http://cdh-vm.dbaglobe.com:8088/
ProxyPassReverse / http://cdh-vm.dbaglobe.com:8088/
# Yarn Node Manager
Listen 192.168.31.14:8042
ProxyPreserveHost On
ProxyPass / http://cdh-vm.dbaglobe.com:8042/
ProxyPassReverse / http://cdh-vm.dbaglobe.com:8042/
# Yarn JobHistory Server
Listen 192.168.31.14:19888
ProxyPreserveHost On
ProxyPass / http://cdh-vm.dbaglobe.com:19888/
ProxyPassReverse / http://cdh-vm.dbaglobe.com:19888/
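For the snippet to take effect it has to be picked up by httpd and SELinux must allow outbound proxy connections. A hedged sketch of the activation steps on RHEL/CentOS 7 (the file name yarn-proxy.conf is an assumption):
# drop the directives above into a conf file, allow httpd to proxy, open the ports, reload
cp yarn-proxy.conf /etc/httpd/conf.d/
setsebool -P httpd_can_network_connect 1
firewall-cmd --permanent --add-port=8088/tcp --add-port=8042/tcp --add-port=19888/tcp && firewall-cmd --reload
apachectl configtest && systemctl restart httpd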
Example to load CSV with newline characters within data into Hadoop tables
[donghua@cdh-vm source]$ cat newline.txt
id,text
1,"a
b"
2,"c"
3,"新年快乐"
[donghua@cdh-vm source]$ cat convert_csv_to_parquet.py
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
df = pd.read_csv('newline.txt')
# Convert from pandas to Arrow
table = pa.Table.from_pandas(df)
pq.write_table(table,'newline.parquet')
[donghua@cdh-vm source]$ python convert_csv_to_parquet.py
[donghua@cdh-vm source]$ parquet-tools cat -j newline.parquet
{"id":1,"text":"YQpi","__index_level_0__":0}
{"id":2,"text":"Yw==","__index_level_0__":1}
{"id":3,"text":"5paw5bm05b+r5LmQ","__index_level_0__":2}
[donghua@cdh-vm source]$ hdfs dfs -mkdir tbl_newline_parquet
[donghua@cdh-vm source]$ hdfs dfs -put newline.parquet tbl_newline_parquet/
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> -- Hive syntax
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> create external table tbl_newline_3
. . . . . . . . . . . . . . . . . . . . . . .> (id bigint, text string)
. . . . . . . . . . . . . . . . . . . . . . .> stored as parquet
. . . . . . . . . . . . . . . . . . . . . . .> location '/user/donghua/tbl_newline_parquet';
No rows affected (0.114 seconds)
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select * from tbl_newline_3;
+-------------------+---------------------+--+
| tbl_newline_3.id | tbl_newline_3.text |
+-------------------+---------------------+--+
| 1 | a
b |
| 2 | c |
| 3 | 新年快乐 |
+-------------------+---------------------+--+
3 rows selected (0.132 seconds)
[cdh-vm.dbaglobe.com:21000] > -- impala syntax
> create external table tbl_newline_2
> LIKE PARQUET '/user/donghua/tbl_newline_parquet/newline.parquet'
> stored as parquet
> location '/user/donghua/tbl_newline_parquet';
[cdh-vm.dbaglobe.com:21000] > desc tbl_newline_2;
+-------------------+--------+-----------------------------+
| name | type | comment |
+-------------------+--------+-----------------------------+
| id | bigint | Inferred from Parquet file. |
| text | string | Inferred from Parquet file. |
| __index_level_0__ | bigint | Inferred from Parquet file. |
+-------------------+--------+-----------------------------+
Fetched 3 row(s) in 0.02s
[cdh-vm.dbaglobe.com:21000] > select * from tbl_newline_2;
+----+----------+-------------------+
| id | text | __index_level_0__ |
+----+----------+-------------------+
| 1 | a | 0 |
| | b | |
| 2 | c | 1 |
| 3 | 新年快乐 | 2 |
+----+----------+-------------------+
Fetched 3 row(s) in 5.25s
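The extra __index_level_0__ column Impala infers comes from the pandas index that pyarrow writes by default. A small hedged variant of the conversion script above drops it at write time (preserve_index is a Table.from_pandas option; verify it against the installed pyarrow version):
# convert without writing the pandas index into the Parquet file
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.read_csv('newline.txt')
table = pa.Table.from_pandas(df, preserve_index=False)   # no __index_level_0__ column
pq.write_table(table, 'newline_noindex.parquet')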