
Load mass data into Hive

  • Method 1: load local data (text)
  • Method 2: Insert as select

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> drop table monthly_taxi_fleet3;
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> create table monthly_taxi_fleet3 (
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> month char(7), company varchar(50),fleet smallint)
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> with serdeproperties (
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def>   "separatorchar" = ",",
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def>    "quotechar"     = "",
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def>   "escapechar"    = "\\"
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> )
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> stored as textfile
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> tblproperties ("skip.header.line.count"="1");


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> load data local inpath '/tmp/monthly_taxi_fleet.csv' overwrite into table monthly_taxi_fleet3;
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select * from monthly_taxi_fleet3 limit 10;
+----------------------------+------------------------------+----------------------------+--+
| monthly_taxi_fleet3.month  | monthly_taxi_fleet3.company  | monthly_taxi_fleet3.fleet  |
+----------------------------+------------------------------+----------------------------+--+
| 2005-01                    | Comfort                      | 9952                       |
| 2005-01                    | CityCab                      | 4965                       |
| 2005-01                    | SMRT                         | 2441                       |
| 2005-01                    | YTC                          | 1223                       |
| 2005-01                    | Individual Yellow- Top       | 696                        |
| 2005-01                    | Smart                        | 320                        |
| 2005-01                    | TransCab                     | 560                        |
| 2005-01                    | Premier                      | 370                        |
| 2005-02                    | Comfort                      | 10046                      |
| 2005-02                    | CityCab                      | 4968                       |
+----------------------------+------------------------------+----------------------------+--+
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def>

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> create table monthly_taxi_fleet4 (
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> month char(7), company varchar(50),fleet smallint)
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> stored as avro;

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert overwrite table monthly_taxi_fleet4
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select * from monthly_taxi_fleet3 limit 10;


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select count(*) from monthly_taxi_fleet4;
+------+--+
| _c0  |
+------+--+
| 10   |
+------+--+

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert into table monthly_taxi_fleet4
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select * from monthly_taxi_fleet3 limit 10;


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select count(*) from monthly_taxi_fleet4;
+------+--+
| _c0  |
+------+--+
| 20   |
+------+--+


Hive alter table DDL to rename table and add/replace columns


Rename Tables

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> show tables;
+----------------------+--+
|       tab_name       |
+----------------------+--+
| datatypedemo         |
| employees            |
| employees2           |
| employees3           |
| hivedatatypedemo     |
| iotdatademo          |
| monthly_taxi_fleet   |
| monthly_taxi_fleet2  |
| monthly_taxi_fleet3  |
| monthly_taxi_fleet4  |
| sales_data           |
| sales_data_dup       |
+----------------------+--+
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> alter table monthly_taxi_fleet4 monthly_taxi_fleet5;
Error: Error while compiling statement: FAILED: ParseException line 1:32 cannot recognize input near 'monthly_taxi_fleet5''<EOF>''<EOF>' in alter table statement (state=42000,code=40000)

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> alter table monthly_taxi_fleet4 rename to monthly_taxi_fleet5;
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> show tables;
+----------------------+--+
|       tab_name       |
+----------------------+--+
| datatypedemo         |
| employees            |
| employees2           |
| employees3           |
| hivedatatypedemo     |
| iotdatademo          |
| monthly_taxi_fleet   |
| monthly_taxi_fleet2  |
| monthly_taxi_fleet3  |
| monthly_taxi_fleet5  |
| sales_data           |
| sales_data_dup       |
+----------------------+--+
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> show create table monthly_taxi_fleet5;
+------------------------------------------------------------------------------+--+
|                                createtab_stmt                                |
+------------------------------------------------------------------------------+--+
| CREATE TABLE `monthly_taxi_fleet5`(                                          |
|   `month` char(7) COMMENT '',                                                |
|   `company` varchar(50) COMMENT '',                                          |
|   `fleet` int COMMENT '')                                                    |
| ROW FORMAT SERDE                                                             |
|   'org.apache.hadoop.hive.serde2.avro.AvroSerDe'                             |
| STORED AS INPUTFORMAT                                                        |
|   'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'               |
| OUTPUTFORMAT                                                                 |
|   'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'              |
| LOCATION                                                                     |
|   'hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5'  |
| TBLPROPERTIES (                                                              |
|   'COLUMN_STATS_ACCURATE'='true',                                            |
|   'last_modified_by'='donghua',                                              |
|   'last_modified_time'='1514072478',                                         |
|   'numFiles'='2',                                                            |
|   'numRows'='20',                                                            |
|   'rawDataSize'='0',                                                         |
|   'totalSize'='1230',                                                        |
|   'transient_lastDdlTime'='1514072478')                                      |
+------------------------------------------------------------------------------+--+

Add columns

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> alter table monthly_taxi_fleet5 add columns (c1 date);
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> show create table monthly_taxi_fleet5;
+------------------------------------------------------------------------------+--+
|                                createtab_stmt                                |
+------------------------------------------------------------------------------+--+
| CREATE TABLE `monthly_taxi_fleet5`(                                          |
|   `month` char(7) COMMENT '',                                                |
|   `company` varchar(50) COMMENT '',                                          |
|   `fleet` int COMMENT '',                                                    |
|   `c1` date COMMENT '')                                                      |
| ROW FORMAT SERDE                                                             |
|   'org.apache.hadoop.hive.serde2.avro.AvroSerDe'                             |
| STORED AS INPUTFORMAT                                                        |
|   'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'               |
| OUTPUTFORMAT                                                                 |
|   'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'              |
| LOCATION                                                                     |
|   'hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5'  |
| TBLPROPERTIES (                                                              |
|   'COLUMN_STATS_ACCURATE'='false',                                           |
|   'last_modified_by'='donghua',                                              |
|   'last_modified_time'='1514072785',                                         |
|   'numFiles'='2',                                                            |
|   'numRows'='-1',                                                            |
|   'rawDataSize'='-1',                                                        |
|   'totalSize'='1230',                                                        |
|   'transient_lastDdlTime'='1514072785')                                      |
+------------------------------------------------------------------------------+--+

Drop Columns

REPLACE COLUMNS removes all existing columns and adds the new set of columns. This can be done only for tables with a native SerDe (DynamicSerDe, MetadataTypedColumnsetSerDe, LazySimpleSerDe and ColumnarSerDe). Refer to Hive SerDe for more information. REPLACE COLUMNS can also be used to drop columns. For example, "ALTER TABLE test_change REPLACE COLUMNS (a int, b int);" will remove column 'c' from test_change's schema.
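A minimal sketch of the test_change example quoted above, assuming a plain text table (its native LazySimpleSerDe permits REPLACE COLUMNS):

create table test_change (a int, b int, c int) stored as textfile;
alter table test_change replace columns (a int, b int);  -- column 'c' is dropped from the schema
desc test_change;                                        -- now lists only a and b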

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> alter table monthly_taxi_fleet5 replace columns (month char(7), company varchar(50),fleet smallint);
Error: Error while processing statement: FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. Replace columns is not supported for table default.monthly_taxi_fleet5. SerDe may be incompatible. (state=42000,code=1)

Use Parquet to demonstrate column drop


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> drop table monthly_taxi_fleet5;
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> create table monthly_taxi_fleet5 (
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> month char(7), company varchar(50),fleet smallint)
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> stored as parquet;


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert overwrite table monthly_taxi_fleet5
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select * from monthly_taxi_fleet3 limit 10;


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> show create table monthly_taxi_fleet5;
+------------------------------------------------------------------------------+--+
|                                createtab_stmt                                |
+------------------------------------------------------------------------------+--+
| CREATE TABLE `monthly_taxi_fleet5`(                                          |
|   `month` char(7),                                                           |
|   `company` varchar(50),                                                     |
|   `fleet` smallint)                                                          |
| ROW FORMAT SERDE                                                             |
|   'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'              |
| STORED AS INPUTFORMAT                                                        |
|   'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'            |
| OUTPUTFORMAT                                                                 |
|   'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'           |
| LOCATION                                                                     |
|   'hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5'  |
| TBLPROPERTIES (                                                              |
|   'COLUMN_STATS_ACCURATE'='true',                                            |
|   'numFiles'='1',                                                            |
|   'numRows'='10',                                                            |
|   'rawDataSize'='30',                                                        |
|   'totalSize'='628',                                                         |
|   'transient_lastDdlTime'='1514073272')                                      |
+------------------------------------------------------------------------------+--+
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select * from monthly_taxi_fleet5;
+----------------------------+------------------------------+----------------------------+--+
| monthly_taxi_fleet5.month  | monthly_taxi_fleet5.company  | monthly_taxi_fleet5.fleet  |
+----------------------------+------------------------------+----------------------------+--+
| 2005-02                    | CityCab                      | 4968                       |
| 2005-02                    | Comfort                      | 10046                      |
| 2005-01                    | Premier                      | 370                        |
| 2005-01                    | TransCab                     | 560                        |
| 2005-01                    | Smart                        | 320                        |
| 2005-01                    | Individual Yellow- Top       | 696                        |
| 2005-01                    | YTC                          | 1223                       |
| 2005-01                    | SMRT                         | 2441                       |
| 2005-01                    | CityCab                      | 4965                       |
| 2005-01                    | Comfort                      | 9952                       |
+----------------------------+------------------------------+----------------------------+--+
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> alter table monthly_taxi_fleet5 add columns (c1 date);
Error: Error while processing statement: FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. Parquet does not support date. See HIVE-6384 (state=08S01,code=1)

https://issues.apache.org/jira/browse/HIVE-6384

Date support was added to Parquet in Hive 1.2.0: https://issues.apache.org/jira/browse/HIVE-8119
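Based on HIVE-8119, the same ALTER with a date column should be accepted on Hive 1.2.0 or later (not verified here; this CDH 5.13.1 cluster ships Hive 1.1.0):

-- expected to succeed on Hive 1.2.0+, assuming the same table definition
alter table monthly_taxi_fleet5 add columns (c1 date);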


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> alter table monthly_taxi_fleet5 add columns (c1 timestamp);
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select * from monthly_taxi_fleet5;

+----------------------------+------------------------------+----------------------------+-------------------------+--+
| monthly_taxi_fleet5.month  | monthly_taxi_fleet5.company  | monthly_taxi_fleet5.fleet  | monthly_taxi_fleet5.c1  |
+----------------------------+------------------------------+----------------------------+-------------------------+--+
| 2005-02                    | CityCab                      | 4968                       | NULL                    |
| 2005-02                    | Comfort                      | 10046                      | NULL                    |
| 2005-01                    | Premier                      | 370                        | NULL                    |
| 2005-01                    | TransCab                     | 560                        | NULL                    |
| 2005-01                    | Smart                        | 320                        | NULL                    |
| 2005-01                    | Individual Yellow- Top       | 696                        | NULL                    |
| 2005-01                    | YTC                          | 1223                       | NULL                    |
| 2005-01                    | SMRT                         | 2441                       | NULL                    |
| 2005-01                    | CityCab                      | 4965                       | NULL                    |
| 2005-01                    | Comfort                      | 9952                       | NULL                    |
+----------------------------+------------------------------+----------------------------+-------------------------+--+
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> show create table monthly_taxi_fleet5;
+------------------------------------------------------------------------------+--+
|                                createtab_stmt                                |
+------------------------------------------------------------------------------+--+
| CREATE TABLE `monthly_taxi_fleet5`(                                          |
|   `month` char(7),                                                           |
|   `company` varchar(50),                                                     |
|   `fleet` smallint,                                                          |
|   `c1` timestamp)                                                            |
| ROW FORMAT SERDE                                                             |
|   'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'              |
| STORED AS INPUTFORMAT                                                        |
|   'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'            |
| OUTPUTFORMAT                                                                 |
|   'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'           |
| LOCATION                                                                     |
|   'hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5'  |
| TBLPROPERTIES (                                                              |
|   'COLUMN_STATS_ACCURATE'='false',                                           |
|   'last_modified_by'='donghua',                                              |
|   'last_modified_time'='1514073343',                                         |
|   'numFiles'='1',                                                            |
|   'numRows'='-1',                                                            |
|   'rawDataSize'='-1',                                                        |
|   'totalSize'='628',                                                         |
|   'transient_lastDdlTime'='1514073343')                                      |
+------------------------------------------------------------------------------+--+
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> alter table monthly_taxi_fleet5 replace columns (month char(7), company varchar(50),fleet smallint);
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> show create table monthly_taxi_fleet5;
+------------------------------------------------------------------------------+--+
|                                createtab_stmt                                |
+------------------------------------------------------------------------------+--+
| CREATE TABLE `monthly_taxi_fleet5`(                                          |
|   `month` char(7),                                                           |
|   `company` varchar(50),                                                     |
|   `fleet` smallint)                                                          |
| ROW FORMAT SERDE                                                             |
|   'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'              |
| STORED AS INPUTFORMAT                                                        |
|   'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'            |
| OUTPUTFORMAT                                                                 |
|   'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'           |
| LOCATION                                                                     |
|   'hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5'  |
| TBLPROPERTIES (                                                              |
|   'COLUMN_STATS_ACCURATE'='false',                                           |
|   'last_modified_by'='donghua',                                              |
|   'last_modified_time'='1514073374',                                         |
|   'numFiles'='1',                                                            |
|   'numRows'='-1',                                                            |
|   'rawDataSize'='-1',                                                        |
|   'totalSize'='628',                                                         |
|   'transient_lastDdlTime'='1514073374')                                      |
+------------------------------------------------------------------------------+--+
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select * from monthly_taxi_fleet5;
+----------------------------+------------------------------+----------------------------+--+
| monthly_taxi_fleet5.month  | monthly_taxi_fleet5.company  | monthly_taxi_fleet5.fleet  |
+----------------------------+------------------------------+----------------------------+--+
| 2005-02                    | CityCab                      | 4968                       |
| 2005-02                    | Comfort                      | 10046                      |
| 2005-01                    | Premier                      | 370                        |
| 2005-01                    | TransCab                     | 560                        |
| 2005-01                    | Smart                        | 320                        |
| 2005-01                    | Individual Yellow- Top       | 696                        |
| 2005-01                    | YTC                          | 1223                       |
| 2005-01                    | SMRT                         | 2441                       |
| 2005-01                    | CityCab                      | 4965                       |
| 2005-01                    | Comfort                      | 9952                       |
+----------------------------+------------------------------+----------------------------+--+
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def>

Reference URLs: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-AlterTable/Partition/Column

Hive built-in function explode example


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> create table explodeexample (empid smallint, score array<double>);
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> desc explodeexample;
+-----------+----------------+----------+--+
| col_name  |   data_type    | comment  |
+-----------+----------------+----------+--+
| empid     | smallint       |          |
| score     | array<double>  |          |
+-----------+----------------+----------+--+
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert into explodeexample values (1,array(80,70.4,99.0,100));
Error: Error while compiling statement: FAILED: SemanticException [Error 10293]: Unable to create temp file for insert values Expression of type TOK_FUNCTION not supported in insert/values (state=42000,code=10293)

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert into explodeexample select 1,array(80,70.4,99.0,100);
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert into explodeexample select 2,array(70,59.5,80,85,95.6,60);
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select * from explodeexample;
+-----------------------+----------------------------------+--+
| explodeexample.empid  |       explodeexample.score       |
+-----------------------+----------------------------------+--+
| 1                     | [80.0,70.4,99.0,100.0]           |
| 2                     | [70.0,59.5,80.0,85.0,95.6,60.0]  |
+-----------------------+----------------------------------+--+

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select empid,size(score) score_count from explodeexample;
+--------+--------------+--+
| empid  | score_count  |
+--------+--------------+--+
| 1      | 4            |
| 2      | 6            |
+--------+--------------+--+

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select explode(score) score from explodeexample where empid=1;
+--------+--+
| score  |
+--------+--+
| 80.0   |
| 70.4   |
| 99.0   |
| 100.0  |
+--------+--+

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select empid,explode(score) score from explodeexample where empid=1;
Error: Error while compiling statement: FAILED: SemanticException [Error 10081]: UDTF's are not supported outside the SELECT clause, nor nested in expressions (state=42000,code=10081)

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select empid,exp.score from explodeexample
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def>   lateral view explode(score) exp as score;
+--------+------------+--+
| empid  | exp.score  |
+--------+------------+--+
| 1      | 80.0       |
| 1      | 70.4       |
| 1      | 99.0       |
| 1      | 100.0      |
| 2      | 70.0       |
| 2      | 59.5       |
| 2      | 80.0       |
| 2      | 85.0       |
| 2      | 95.6       |
| 2      | 60.0       |
+--------+------------+--+

Alternative way to insert values into a complex data type, using UNION ALL

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert into explodeexample
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select 3,array(82,73.2,96.0) union all
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select 4,array(56,85.3,82.0,99.9) union all
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select 5,array(65,93.0) union all
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select 6,array(54,55.4,68.0,86);


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select * from explodeexample;
+-----------------------+----------------------------------+--+
| explodeexample.empid  |       explodeexample.score       |
+-----------------------+----------------------------------+--+
| 1                     | [80.0,70.4,99.0,100.0]           |
| 2                     | [70.0,59.5,80.0,85.0,95.6,60.0]  |
| 3                     | [82.0,73.2,96.0]                 |
| 4                     | [56.0,85.3,82.0,99.9]            |
| 5                     | [65.0,93.0]                      |
| 6                     | [54.0,55.4,68.0,86.0]            |
+-----------------------+----------------------------------+--+

Other tips to work with Hive functions:

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> desc function case;
+----------------------------------------------------+--+
|                      tab_name                      |
+----------------------------------------------------+--+
| CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END - When a = b, returns c; when a = d, return e; else return f |
+----------------------------------------------------+--+
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> desc function extended case;
+----------------------------------------------------+--+
|                      tab_name                      |
+----------------------------------------------------+--+
| CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END - When a = b, returns c; when a = d, return e; else return f |
| Example:                                           |
|  SELECT                                            |
|  CASE deptno                                       |
|    WHEN 1 THEN Engineering                         |
|    WHEN 2 THEN Finance                             |
|    ELSE admin                                      |
|  END,                                              |
|  CASE zone                                         |
|    WHEN 7 THEN Americas                            |
|    ELSE Asia-Pac                                   |
|  END                                               |
|  FROM emp_details                                  |
+----------------------------------------------------+--+

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> show functions;
+-------------------------+--+
|        tab_name         |
+-------------------------+--+
| !                       |
| !=                      |
| %                       |
| &                       |
| *                       |
| +                       |
| -                       |
| /                       |
| <                       |
| <=                      |
| <=>                     |
| <>                      |
| =                       |
| ==                      |
| >                       |
| >=                      |
| ^                       |
| abs                     |
| acos                    |
| add_months              |
| and                     |
| array                   |
| array_contains          |
| ascii                   |
| asin                    |
| assert_true             |
| atan                    |
| avg                     |
| base64                  |
| between                 |
| bin                     |
| case                    |
| cbrt                    |
| ceil                    |
| ceiling                 |
| coalesce                |
| collect_list            |
| collect_set             |
| compute_stats           |
| concat                  |
| concat_ws               |
| context_ngrams          |
| conv                    |
| corr                    |
| cos                     |
| count                   |
| covar_pop               |
| covar_samp              |
| crc32                   |
| create_union            |
| cume_dist               |
| current_database        |
| current_date            |
| current_timestamp       |
| current_user            |
| date_add                |
| date_format             |
| date_sub                |
| datediff                |
| day                     |
| dayofmonth              |
| dayofweek               |
| decode                  |
| degrees                 |
| dense_rank              |
| div                     |
| e                       |
| elt                     |
| encode                  |
| ewah_bitmap             |
| ewah_bitmap_and         |
| ewah_bitmap_empty       |
| ewah_bitmap_or          |
| exp                     |
| explode                 |
| field                   |
| find_in_set             |
| first_value             |
| floor                   |
| format_number           |
| from_unixtime           |
| from_utc_timestamp      |
| get_json_object         |
| greatest                |
| hash                    |
| hex                     |
| histogram_numeric       |
| hour                    |
| if                      |
| in                      |
| in_file                 |
| index                   |
| initcap                 |
| inline                  |
| instr                   |
| isnotnull               |
| isnull                  |
| java_method             |
| json_tuple              |
| lag                     |
+-------------------------+--+
|        tab_name         |
+-------------------------+--+
| last_day                |
| last_value              |
| lcase                   |
| lead                    |
| least                   |
| length                  |
| levenshtein             |
| like                    |
| ln                      |
| locate                  |
| log                     |
| log10                   |
| log2                    |
| logged_in_user          |
| lower                   |
| lpad                    |
| ltrim                   |
| map                     |
| map_keys                |
| map_values              |
| matchpath               |
| max                     |
| md5                     |
| min                     |
| minute                  |
| month                   |
| months_between          |
| named_struct            |
| negative                |
| next_day                |
| ngrams                  |
| noop                    |
| noopstreaming           |
| noopwithmap             |
| noopwithmapstreaming    |
| not                     |
| ntile                   |
| nvl                     |
| or                      |
| parse_url               |
| parse_url_tuple         |
| percent_rank            |
| percentile              |
| percentile_approx       |
| pi                      |
| pmod                    |
| posexplode              |
| positive                |
| pow                     |
| power                   |
| printf                  |
| radians                 |
| rand                    |
| rank                    |
| reflect                 |
| reflect2                |
| regexp                  |
| regexp_extract          |
| regexp_replace          |
| repeat                  |
| reverse                 |
| rlike                   |
| round                   |
| row_number              |
| rpad                    |
| rtrim                   |
| second                  |
| sentences               |
| sha2                    |
| sign                    |
| sin                     |
| size                    |
| sort_array              |
| soundex                 |
| space                   |
| split                   |
| sqrt                    |
| stack                   |
| std                     |
| stddev                  |
| stddev_pop              |
| stddev_samp             |
| str_to_map              |
| struct                  |
| substr                  |
| substring               |
| sum                     |
| tan                     |
| to_date                 |
| to_unix_timestamp       |
| to_utc_timestamp        |
| translate               |
| trim                    |
| trunc                   |
| ucase                   |
| unbase64                |
| unhex                   |
| unix_timestamp          |
| upper                   |
| uuid                    |
+-------------------------+--+
|        tab_name         |
+-------------------------+--+
| var_pop                 |
| var_samp                |
| variance                |
| version                 |
| weekofyear              |
| when                    |
| windowingtablefunction  |
| xpath                   |
| xpath_boolean           |
| xpath_double            |
| xpath_float             |
| xpath_int               |
| xpath_long              |
| xpath_number            |
| xpath_short             |
| xpath_string            |
| year                    |
| |                       |
| ~                       |
+-------------------------+--+

Hive collection data type example


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> create table empsubquery
. . . . . . . . . . . . . . . . . . . . . . .> ( empid int,
. . . . . . . . . . . . . . . . . . . . . . .>   firstname varchar(30),
. . . . . . . . . . . . . . . . . . . . . . .>   lastname varchar(30),
. . . . . . . . . . . . . . . . . . . . . . .>   tenure int,
. . . . . . . . . . . . . . . . . . . . . . .>   address struct<street:string,city:string>,
. . . . . . . . . . . . . . . . . . . . . . .>   subordinates array<string>);


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert into empsubquery
. . . . . . . . . . . . . . . . . . . . . . .> select 1,'Donghua','Luo',2,named_struct("street","Serangoon Road","city","Singapore"),array("Larry","Tom") union all
. . . . . . . . . . . . . . . . . . . . . . .> select 2,'Larry','Elison',5,named_struct("street","Victor Street","city","New York"),array("Tom") union all
. . . . . . . . . . . . . . . . . . . . . . .> select 3,'Tom','Kyte',4,named_struct("street","Victor Street","city","New York"),array("Tiger","Leon") union all
. . . . . . . . . . . . . . . . . . . . . . .> select 4,'Tiger','Hood',3,named_struct("street","Eliz Road","city","London"),array("Jack");

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> !outputformat vertical
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select * from empsubquery;
empsubquery.empid         1
empsubquery.firstname     Donghua
empsubquery.lastname      Luo
empsubquery.tenure        2
empsubquery.address       {"street":"Serangoon Road","city":"Singapore"}
empsubquery.subordinates  ["Larry","Tom"]

empsubquery.empid         2
empsubquery.firstname     Larry
empsubquery.lastname      Elison
empsubquery.tenure        5
empsubquery.address       {"street":"Victor Street","city":"New York"}
empsubquery.subordinates  ["Tom"]

empsubquery.empid         3
empsubquery.firstname     Tom
empsubquery.lastname      Kyte
empsubquery.tenure        4
empsubquery.address       {"street":"Victor Street","city":"New York"}
empsubquery.subordinates  ["Tiger","Leon"]

empsubquery.empid         4
empsubquery.firstname     Tiger
empsubquery.lastname      Hood
empsubquery.tenure        3
empsubquery.address       {"street":"Eliz Road","city":"London"}
empsubquery.subordinates  ["Jack"]

4 rows selected (0.271 seconds)

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> !outputformat table
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select firstname,address.street,address.city from empsubquery;
+------------+-----------------+------------+--+
| firstname  |     street      |    city    |
+------------+-----------------+------------+--+
| Donghua    | Serangoon Road  | Singapore  |
| Larry      | Victor Street   | New York   |
| Tom        | Victor Street   | New York   |
| Tiger      | Eliz Road       | London     |
+------------+-----------------+------------+--+
4 rows selected (16.004 seconds)

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select firstname,subordinates[0] subordinates_0,subordinates[1] subordinates_1 from empsubquery;
+------------+-----------------+-----------------+--+
| firstname  | subordinates_0  | subordinates_1  |
+------------+-----------------+-----------------+--+
| Donghua    | Larry           | Tom             |
| Larry      | Tom             | NULL            |
| Tom        | Tiger           | Leon            |
| Tiger      | Jack            | NULL            |
+------------+-----------------+-----------------+--+
4 rows selected (15.304 seconds)
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def>
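The map collection type follows the same pattern; a minimal, untested sketch (the empphones table and its columns are hypothetical names, not part of the demo above):

create table empphones (empid int, phones map<string,string>);
insert into empphones
select 1, map('office','65551234','mobile','91234567') union all
select 2, map('mobile','98765432');
-- individual values are addressed by key
select empid, phones['mobile'] as mobile from empphones;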

Hive Bitmap Indexes with example


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> show create table monthly_taxi_fleet5;
createtab_stmt
CREATE TABLE `monthly_taxi_fleet5`(
  `month` char(7),
  `company` varchar(50),
  `fleet` smallint)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  'hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5'
TBLPROPERTIES (
  'COLUMN_STATS_ACCURATE'='false',
  'last_modified_by'='donghua',
  'last_modified_time'='1514073485',
  'numFiles'='1',
  'numRows'='-1',
  'rawDataSize'='-1',
  'totalSize'='628',
  'transient_lastDdlTime'='1514073485')
21 rows selected (0.069 seconds)


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> !outputformat csv2
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> explain select * from monthly_taxi_fleet5 where company='Comfort';

Explain
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Map Operator Tree:
           TableScan
            alias: monthly_taxi_fleet5
             filterExpr: (company = 'Comfort') (type: boolean)
             Statistics: Num rows: 10 Data size: 628 Basic stats: COMPLETE Column stats: NONE
            Filter Operator
              predicate: (company = 'Comfort') (type: boolean)
              Statistics: Num rows: 5 Data size: 314 Basic stats: COMPLETE Column stats: NONE
              Select Operator
                expressions: month (type: char(7)), company (type: varchar(50)), fleet (type: smallint)
                outputColumnNames: _col0, _col1, _col2
                Statistics: Num rows: 5 Data size: 314 Basic stats: COMPLETE Column stats: NONE
                File Output Operator
                  compressed: false
                   Statistics: Num rows: 5 Data size: 314 Basic stats: COMPLETE Column stats: NONE
                  table:
                       input format: org.apache.hadoop.mapred.TextInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

  Stage: Stage-0
    Fetch Operator
      limit: -1
      Processor Tree:
        ListSink

33 rows selected (0.183 seconds)

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> create index monthly_taxi_fleet5_company on table monthly_taxi_fleet5(company) as 'bitmap';
ERROR : FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.RuntimeException: Please specify deferred rebuild using " WITH DEFERRED REBUILD ".
Error: Error while processing statement: FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.RuntimeException: Please specify deferred rebuild using " WITH DEFERRED REBUILD ". (state=08S01,code=1)

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> create index monthly_taxi_fleet5_company on table monthly_taxi_fleet5(company) as 'bitmap' with deferred rebuild;


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> alter index monthly_taxi_fleet5_company on monthly_taxi_fleet5 rebuild;
No rows affected (21.826 seconds)

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> explain select * from monthly_taxi_fleet5 where company='Comfort';

Explain
STAGE DEPENDENCIES:
  Stage-3 is a root stage
  Stage-2 depends on stages: Stage-3
  Stage-1 depends on stages: Stage-2
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-3
    Map Reduce
      Map Operator Tree:
          TableScan
            alias: default__monthly_taxi_fleet5_monthly_taxi_fleet5_company__
            filterExpr: ((company = 'Comfort') and (not EWAH_BITMAP_EMPTY(_bitmaps))) (type: boolean)
             Filter Operator
              predicate: ((company = 'Comfort') and (not EWAH_BITMAP_EMPTY(_bitmaps))) (type: boolean)
               Select Operator
                expressions: _bucketname (type: string), _offset (type: bigint)
                outputColumnNames: _col0, _col1
                Group By Operator
                   aggregations: collect_set(_col1)
                  keys: _col0 (type: string)
                  mode: hash
                   outputColumnNames: _col0, _col1
                  Reduce Output Operator
                    key expressions: _col0 (type: string)
                    sort order: +
                     Map-reduce partition columns: _col0 (type: string)
                     value expressions: _col1 (type: array<bigint>)
      Reduce Operator Tree:
        Group By Operator
          aggregations: collect_set(VALUE._col0)
          keys: KEY._col0 (type: string)
          mode: mergepartial
          outputColumnNames: _col0, _col1
          File Output Operator
            compressed: false
            table:
                input format: org.apache.hadoop.mapred.TextInputFormat
                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

  Stage: Stage-2
    Move Operator
      files:
           hdfs directory: true
          destination: hdfs://cdh-vm.dbaglobe.com:8020/tmp/hive/donghua/6086c443-dac8-4517-8118-a37d42ce56a4/hive_2017-12-25_19-08-59_026_6029178632909749720-6/-mr-10003

  Stage: Stage-1
    Map Reduce
      Map Operator Tree:
           TableScan
            alias: monthly_taxi_fleet5
             filterExpr: (company = 'Comfort') (type: boolean)
             Statistics: Num rows: 10 Data size: 628 Basic stats: COMPLETE Column stats: NONE
            Filter Operator
              predicate: (company = 'Comfort') (type: boolean)
              Statistics: Num rows: 5 Data size: 314 Basic stats: COMPLETE Column stats: NONE
              Select Operator
                expressions: month (type: char(7)), company (type: varchar(50)), fleet (type: smallint)
                outputColumnNames: _col0, _col1, _col2
                Statistics: Num rows: 5 Data size: 314 Basic stats: COMPLETE Column stats: NONE
                File Output Operator
                  compressed: false
                   Statistics: Num rows: 5 Data size: 314 Basic stats: COMPLETE Column stats: NONE
                  table:
                       input format: org.apache.hadoop.mapred.TextInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

  Stage: Stage-0
    Fetch Operator
      limit: -1
      Processor Tree:
        ListSink

75 rows selected (0.39 seconds)


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select * from monthly_taxi_fleet5 where company='Comfort';
INFO  : Compiling command(queryId=hive_20171225191313_3bbd504c-a4ec-4cf8-8427-fea0f832f785): select * from monthly_taxi_fleet5 where company='Comfort'
INFO  : Compiling command(queryId=hive_20171225191313_3bbd504c-a4ec-4cf8-8427-fea0f832f785): INSERT OVERWRITE DIRECTORY "hdfs://cdh-vm.dbaglobe.com:8020/tmp/hive/donghua/5a539317-9cca-43b8-b460-de23c3d7c159/hive_2017-12-25_19-13-24_864_2203428447630231527-6/-mr-10003" SELECT bucketname AS `_bucketname` , COLLECT_SET(offset) AS `_offsets` FROM (SELECT `_bucketname` AS bucketname , `_offset` AS offset FROM (SELECT * FROM `default__monthly_taxi_fleet5_monthly_taxi_fleet5_company__` WHERE (company = 'Comfort')) ind0 WHERE NOT EWAH_BITMAP_EMPTY(ind0.`_bitmaps`) ) tmp_index GROUP BY bucketname
INFO  : Semantic Analysis Completed
INFO  : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:_bucketname, type:string, comment:null), FieldSchema(name:_offsets, type:array<bigint>, comment:null)], properties:null)
INFO  : Completed compiling command(queryId=hive_20171225191313_3bbd504c-a4ec-4cf8-8427-fea0f832f785); Time taken: 0.07 seconds
INFO  : Semantic Analysis Completed
INFO  : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:monthly_taxi_fleet5.month, type:char(7), comment:null), FieldSchema(name:monthly_taxi_fleet5.company, type:varchar(50), comment:null), FieldSchema(name:monthly_taxi_fleet5.fleet, type:smallint, comment:null)], properties:null)
INFO  : Completed compiling command(queryId=hive_20171225191313_3bbd504c-a4ec-4cf8-8427-fea0f832f785); Time taken: 0.071 seconds
INFO  : Concurrency mode is disabled, not creating a lock manager
INFO  : Executing command(queryId=hive_20171225191313_3bbd504c-a4ec-4cf8-8427-fea0f832f785): select * from monthly_taxi_fleet5 where company='Comfort'
INFO  : Query ID = hive_20171225191313_3bbd504c-a4ec-4cf8-8427-fea0f832f785
INFO  : Total jobs = 2
INFO  : Launching Job 1 out of 2
INFO  : Starting task [Stage-3:MAPRED] in serial mode
INFO  : Number of reduce tasks not specified. Estimated from input data size: 1
INFO  : In order to change the average load for a reducer (in bytes):
INFO  :   set hive.exec.reducers.bytes.per.reducer=<number>
INFO  : In order to limit the maximum number of reducers:
INFO  :   set hive.exec.reducers.max=<number>
INFO  : In order to set a constant number of reducers:
INFO  :   set mapreduce.job.reduces=<number>
INFO  : number of splits:1
INFO  : Submitting tokens for job: job_1513984921012_0058
INFO  : The url to track the job:
http://cdh-vm.dbaglobe.com:8088/proxy/application_1513984921012_0058/
INFO  : Starting Job = job_1513984921012_0058, Tracking URL = http://cdh-vm.dbaglobe.com:8088/proxy/application_1513984921012_0058/
INFO  : Kill Command = /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/hadoop/bin/hadoop job  -kill job_1513984921012_0058
INFO  : Hadoop job information for Stage-3: number of mappers: 1; number of reducers: 1
INFO  : 2017-12-25 19:13:33,536 Stage-3 map = 0%,  reduce = 0%
INFO  : 2017-12-25 19:13:38,875 Stage-3 map = 100%,  reduce = 0%, Cumulative CPU 1.62 sec
INFO  : 2017-12-25 19:13:45,406 Stage-3 map = 100%,  reduce = 100%, Cumulative CPU 3.47 sec
INFO  : MapReduce Total cumulative CPU time: 3 seconds 470 msec
INFO  : Ended Job = job_1513984921012_0058
INFO  : Starting task [Stage-2:MOVE] in serial mode
INFO  : Moving data to: hdfs://cdh-vm.dbaglobe.com:8020/tmp/hive/donghua/5a539317-9cca-43b8-b460-de23c3d7c159/hive_2017-12-25_19-13-24_864_2203428447630231527-6/-mr-10003 from hdfs://cdh-vm.dbaglobe.com:8020/tmp/hive/donghua/5a539317-9cca-43b8-b460-de23c3d7c159/hive_2017-12-25_19-13-24_864_2203428447630231527-6/-mr-10003/.hive-staging_hive_2017-12-25_19-13-24_972_8970662548653177884-6/-ext-10000
INFO  : Launching Job 2 out of 2
INFO  : Starting task [Stage-1:MAPRED] in serial mode
INFO  : Number of reduce tasks is set to 0 since there's no reduce operator
INFO  : number of splits:2
INFO  : Submitting tokens for job: job_1513984921012_0059
INFO  : The url to track the job:
http://cdh-vm.dbaglobe.com:8088/proxy/application_1513984921012_0059/
INFO  : Starting Job = job_1513984921012_0059, Tracking URL = http://cdh-vm.dbaglobe.com:8088/proxy/application_1513984921012_0059/
INFO  : Kill Command = /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/hadoop/bin/hadoop job  -kill job_1513984921012_0059
INFO  : Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 0
INFO  : 2017-12-25 19:13:54,773 Stage-1 map = 0%,  reduce = 0%
INFO  : 2017-12-25 19:14:04,866 Stage-1 map = 50%,  reduce = 0%, Cumulative CPU 1.79 sec
INFO  : 2017-12-25 19:14:06,983 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 4.9 sec
INFO  : MapReduce Total cumulative CPU time: 4 seconds 900 msec
INFO  : Ended Job = job_1513984921012_0059
INFO  : MapReduce Jobs Launched:
INFO  : Stage-Stage-3: Map: 1  Reduce: 1   Cumulative CPU: 3.47 sec   HDFS Read: 11432 HDFS Write: 88 SUCCESS
INFO  : Stage-Stage-1: Map: 2   Cumulative CPU: 4.9 sec   HDFS Read: 11938 HDFS Write: 43 SUCCESS
INFO  : Total MapReduce CPU Time Spent: 8 seconds 370 msec
INFO  : Completed executing command(queryId=hive_20171225191313_3bbd504c-a4ec-4cf8-8427-fea0f832f785); Time taken: 43.095 seconds
INFO  : OK
monthly_taxi_fleet5.month,monthly_taxi_fleet5.company,monthly_taxi_fleet5.fleet
2005-02,Comfort,10046
2005-01,Comfort,9952
2 rows selected (43.346 seconds)

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> !outputformat vertical


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select * from default__monthly_taxi_fleet5_monthly_taxi_fleet5_company__ company_index;


company_index.company      CityCab
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      62
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      Comfort
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      62
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      Premier
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      125
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      TransCab
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      188
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      Smart
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      251
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      Individual Yellow- Top
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      314
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      YTC
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      376
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      SMRT
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      439
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      CityCab
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      502
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      Comfort
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      565
company_index._bitmaps     [1,2,4,8589934592,1,0]

10 rows selected (0.143 seconds)

Example to drop the index:

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> drop index monthly_taxi_fleet5_company on monthly_taxi_fleet5;

Hive Partition by Examples


[donghua@cdh-vm ~]$  beeline -u jdbc:hive2://cdh-vm.dbaglobe.com:10000/default -n donghua
Connecting to jdbc:hive2://cdh-vm.dbaglobe.com:10000/default
Connected to: Apache Hive (version 1.1.0-cdh5.13.1)
Driver: Hive JDBC (version 1.1.0-cdh5.13.0)
Transaction isolation: TRANSACTION_REPEATABLE_READ
Beeline version 1.1.0-cdh5.13.0 by Apache Hive
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def>

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select substr(month,0,4) as year,month,company,fleet from monthly_taxi_fleet limit 10;
+-------+----------+-------------------------+--------+--+
| year  |  month   |         company         | fleet  |
+-------+----------+-------------------------+--------+--+
| 2005  | 2005-01  | Comfort                 | 9952   |
| 2005  | 2005-01  | CityCab                 | 4965   |
| 2005  | 2005-01  | SMRT                    | 2441   |
| 2005  | 2005-01  | YTC                     | 1223   |
| 2005  | 2005-01  | Individual Yellow- Top  | 696    |
| 2005  | 2005-01  | Smart                   | 320    |
| 2005  | 2005-01  | TransCab                | 560    |
| 2005  | 2005-01  | Premier                 | 370    |
| 2005  | 2005-02  | Comfort                 | 10046  |
| 2005  | 2005-02  | CityCab                 | 4968   |
+-------+----------+-------------------------+--------+--+

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> create table monthly_taxi_fleet_company_year_part
. . . . . . . . . . . . . . . . . . . . . . .> (month char(7),fleet smallint)
. . . . . . . . . . . . . . . . . . . . . . .> partitioned by (company varchar(50),year char(4))
. . . . . . . . . . . . . . . . . . . . . . .> stored as avro;


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> desc monthly_taxi_fleet_company_year_part;

+--------------------------+-----------------------+-----------------------+--+
|         col_name         |       data_type       |        comment        |
+--------------------------+-----------------------+-----------------------+--+
| month                    | char(7)               |                       |
| fleet                    | int                   |                       |
| company                  | varchar(50)           |                       |
| year                     | char(4)               |                       |
|                          | NULL                  | NULL                  |
| # Partition Information  | NULL                  | NULL                  |
| # col_name               | data_type             | comment               |
|                          | NULL                  | NULL                  |
| company                  | varchar(50)           |                       |
| year                     | char(4)               |                       |
+--------------------------+-----------------------+-----------------------+--+


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert into monthly_taxi_fleet_company_year_part (month,fleet,company,year)
. . . . . . . . . . . . . . . . . . . . . . .> values('2017-10',10000,'Comfort','2017');
Error: Error while compiling statement: FAILED: SemanticException 1:50 '[year, company]' in insert schema specification are not found among regular columns of default.monthly_taxi_fleet_company_year_part nor dynamic partition columns.. Error encountered near token 'year' (state=42000,code=40000)

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert into monthly_taxi_fleet_company_year_part
. . . . . . . . . . . . . . . . . . . . . . .> partition (company='Comfort',year='2017')
. . . . . . . . . . . . . . . . . . . . . . .> values('2017-10',10000);

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select * from monthly_taxi_fleet_company_year_part o;
+----------+----------+------------+---------+--+
| o.month  | o.fleet  | o.company  | o.year  |
+----------+----------+------------+---------+--+
| 2017-10  | 10000    | Comfort    | 2017    |
+----------+----------+------------+---------+--+
1 row selected (0.102 seconds)

[donghua@cdh-vm ~]$ hdfs dfs -ls -R /user/hive/warehouse/monthly_taxi_fleet_company_year_part
drwxrwxrwt   - donghua hive          0 2017-12-25 21:25 /user/hive/warehouse/monthly_taxi_fleet_company_year_part/company=Comfort
drwxrwxrwt   - donghua hive          0 2017-12-25 21:25 /user/hive/warehouse/monthly_taxi_fleet_company_year_part/company=Comfort/year=2017
-rwxrwxrwt   1 donghua hive        318 2017-12-25 21:25 /user/hive/warehouse/monthly_taxi_fleet_company_year_part/company=Comfort/year=2017/000000_0

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> drop table monthly_taxi_fleet_company_year_part;

-- Dynamic partition example

-- These settings apply to the current session only;
-- to make them permanent, edit the properties in hive-site.xml
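
For reference, the equivalent hive-site.xml entries would look roughly like the snippet below (property names as in the set commands that follow; on a CDH cluster these are typically applied through a Cloudera Manager safety valve rather than by editing the file by hand):

<property>
  <name>hive.exec.dynamic.partition</name>
  <value>true</value>
</property>
<property>
  <name>hive.exec.dynamic.partition.mode</name>
  <value>nonstrict</value>
</property>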


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> set hive.exec.dynamic.partition = true;
No rows affected (0.003 seconds)
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> set hive.exec.dynamic.partition.mode = nonstrict;
No rows affected (0.004 seconds)

-- (re-create monthly_taxi_fleet_company_year_part with the same DDL as earlier before running these inserts)
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert into monthly_taxi_fleet_company_year_part
. . . . . . . . . . . . . . . . . . . . . . .> partition (company,year)
. . . . . . . . . . . . . . . . . . . . . . .> values('2017-10',10000,'Comfort','2017');


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert into monthly_taxi_fleet_company_year_part
. . . . . . . . . . . . . . . . . . . . . . .> partition (company,year)
. . . . . . . . . . . . . . . . . . . . . . .> select month,fleet,company,substr(month,0,4) as year from monthly_taxi_fleet;
INFO  : Compiling command(queryId=hive_20171225221515_3c4f0c8e-9d61-40e4-9d21-9e6c3249c230): insert into monthly_taxi_fleet_company_year_part
partition (company,year)
select month,fleet,company,substr(month,0,4) as year from monthly_taxi_fleet
INFO  : Semantic Analysis Completed
INFO  : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:_col0, type:char(7), comment:null), FieldSchema(name:_col1, type:int, comment:null), FieldSchema(name:_col2, type:varchar(50), comment:null), FieldSchema(name:_col3, type:string, comment:null)], properties:null)
INFO  : Completed compiling command(queryId=hive_20171225221515_3c4f0c8e-9d61-40e4-9d21-9e6c3249c230); Time taken: 0.166 seconds
INFO  : Concurrency mode is disabled, not creating a lock manager
INFO  : Executing command(queryId=hive_20171225221515_3c4f0c8e-9d61-40e4-9d21-9e6c3249c230): insert into monthly_taxi_fleet_company_year_part
partition (company,year)
select month,fleet,company,substr(month,0,4) as year from monthly_taxi_fleet
INFO  : Query ID = hive_20171225221515_3c4f0c8e-9d61-40e4-9d21-9e6c3249c230
INFO  : Total jobs = 3
INFO  : Launching Job 1 out of 3
INFO  : Starting task [Stage-1:MAPRED] in serial mode
INFO  : Number of reduce tasks is set to 0 since there's no reduce operator
INFO  : number of splits:1
INFO  : Submitting tokens for job: job_1513984921012_0069
INFO  : The url to track the job:
http://cdh-vm.dbaglobe.com:8088/proxy/application_1513984921012_0069/
INFO  : Starting Job = job_1513984921012_0069, Tracking URL = http://cdh-vm.dbaglobe.com:8088/proxy/application_1513984921012_0069/
INFO  : Kill Command = /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/hadoop/bin/hadoop job  -kill job_1513984921012_0069
INFO  : Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
INFO  : 2017-12-25 22:15:51,245 Stage-1 map = 0%,  reduce = 0%
INFO  : 2017-12-25 22:16:04,336 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 4.81 sec
INFO  : MapReduce Total cumulative CPU time: 4 seconds 810 msec
INFO  : Ended Job = job_1513984921012_0069
INFO  : Starting task [Stage-7:CONDITIONAL] in serial mode
INFO  : Stage-4 is selected by condition resolver.
INFO  : Stage-3 is filtered out by condition resolver.
INFO  : Stage-5 is filtered out by condition resolver.
INFO  : Starting task [Stage-4:MOVE] in serial mode
INFO  : Moving data to: hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet_company_year_part/.hive-staging_hive_2017-12-25_22-15-43_889_5842295461208605082-6/-ext-10000 from hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet_company_year_part/.hive-staging_hive_2017-12-25_22-15-43_889_5842295461208605082-6/-ext-10002
INFO  : Starting task [Stage-0:MOVE] in serial mode
INFO  : Loading data to table default.monthly_taxi_fleet_company_year_part partition (company=null, year=null) from hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet_company_year_part/.hive-staging_hive_2017-12-25_22-15-43_889_5842295461208605082-6/-ext-10000
INFO  :          Time taken for load dynamic partitions : 15475
INFO  :         Loading partition {company=CityCab, year=2005}
INFO  :         Loading partition {company=Prime, year=2008}
INFO  :         Loading partition {company=CityCab, year=2009}
INFO  :         Loading partition {company=SMRT, year=2014}
INFO  :         Loading partition {company=Individual Yellow- Top, year=2013}
INFO  :         Loading partition {company=Comfort, year=2016}
<omitted for reading clarity>
INFO  :         Loading partition {company=Comfort, year=2008}
INFO  :         Loading partition {company=TransCab, year=2007}
INFO  :         Loading partition {company=Individual Yellow- Top, year=2008}
INFO  :         Loading partition {company=Prime, year=2016}
INFO  :         Loading partition {company=Prime, year=2013}
INFO  :         Loading partition {company=TransCab, year=2008}
INFO  :         Loading partition {company=Comfort, year=2009}
INFO  :         Loading partition {company=TransCab, year=2011}
INFO  :          Time taken for adding to write entity : 3
INFO  : Starting task [Stage-2:STATS] in serial mode
INFO  : Partition default.monthly_taxi_fleet_company_year_part{company=CityCab, year=2005} stats: [numFiles=1, numRows=12, totalSize=450, rawDataSize=0]
INFO  : Partition default.monthly_taxi_fleet_company_year_part{company=CityCab, year=2006} stats: [numFiles=1, numRows=12, totalSize=450, rawDataSize=0]
INFO  : Partition default.monthly_taxi_fleet_company_year_part{company=CityCab, year=2007} stats: [numFiles=1, numRows=12, totalSize=450, rawDataSize=0]
INFO  : Partition default.monthly_taxi_fleet_company_year_part{company=CityCab, year=2008} stats: [numFiles=1, numRows=12, totalSize=450, rawDataSize=0]
INFO  : Partition default.monthly_taxi_fleet_company_year_part{company=CityCab, year=2009} stats: [numFiles=1, numRows=12, totalSize=450, rawDataSize=0]
INFO  : Partition default.monthly_taxi_fleet_company_year_part{company=CityCab, year=2010} stats: [numFiles=1, numRows=12, totalSize=450, rawDataSize=0]
INFO  : Partition default.monthly_taxi_fleet_company_year_part{company=CityCab, year=2011} stats: [numFiles=1, numRows=12, totalSize=450, rawDataSize=0]
INFO  : Partition default.monthly_taxi_fleet_company_year_part{company=CityCab, year=2012} stats: [numFiles=1, numRows=12, totalSize=450, rawDataSize=0]
INFO  : Partition default.monthly_taxi_fleet_company_year_part{company=CityCab, year=2013} stats: [numFiles=1, numRows=12, totalSize=450, rawDataSize=0]
INFO  : Partition default.monthly_taxi_fleet_company_year_part{company=CityCab, year=2014} stats: [numFiles=1, numRows=12, totalSize=450, rawDataSize=0]
INFO  : Partition default.monthly_taxi_fleet_company_year_part{company=CityCab, year=2015} stats: [numFiles=1, numRows=12, totalSize=450, rawDataSize=0]
INFO  : Partition default.monthly_taxi_fleet_company_year_part{company=CityCab, year=2016} stats: [numFiles=1, numRows=12, totalSize=450, rawDataSize=0]
INFO  : Partition default.monthly_taxi_fleet_company_year_part{company=Comfort, year=2005} stats: [numFiles=1, numRows=12, totalSize=462, rawDataSize=0]
INFO  : Partition default.monthly_taxi_fleet_company_year_part{company=Comfort, year=2006} stats: [numFiles=1, numRows=12, totalSize=462, rawDataSize=0]

<omitted for reading clarity>

INFO  : Partition default.monthly_taxi_fleet_company_year_part{company=YTC, year=2005} stats: [numFiles=1, numRows=12, totalSize=450, rawDataSize=0]
INFO  : Partition default.monthly_taxi_fleet_company_year_part{company=YTC, year=2006} stats: [numFiles=1, numRows=12, totalSize=450, rawDataSize=0]
INFO  : Partition default.monthly_taxi_fleet_company_year_part{company=YTC, year=2007} stats: [numFiles=1, numRows=12, totalSize=447, rawDataSize=0]
INFO  : MapReduce Jobs Launched:
INFO  : Stage-Stage-1: Map: 1   Cumulative CPU: 4.81 sec   HDFS Read: 31633 HDFS Write: 50547 SUCCESS
INFO  : Total MapReduce CPU Time Spent: 4 seconds 810 msec
INFO  : Completed executing command(queryId=hive_20171225221515_3c4f0c8e-9d61-40e4-9d21-9e6c3249c230); Time taken: 40.317 seconds
INFO  : OK
No rows affected (40.505 seconds)
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def>


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> show partitions monthly_taxi_fleet_company_year_part;
+-------------------------------------------+--+
|                 partition                 |
+-------------------------------------------+--+
| company=CityCab/year=2005                 |
| company=CityCab/year=2006                 |
| company=CityCab/year=2007                 |
| company=CityCab/year=2008                 |
| company=CityCab/year=2009                 |
| company=CityCab/year=2010                 |
| company=CityCab/year=2011                 |
| company=CityCab/year=2012                 |
| company=CityCab/year=2013                 |
| company=CityCab/year=2014                 |
<omitted for reading clarity>
| company=TransCab/year=2012                |
| company=TransCab/year=2013                |
| company=TransCab/year=2014                |
| company=TransCab/year=2015                |
| company=TransCab/year=2016                |
| company=YTC/year=2005                     |
| company=YTC/year=2006                     |
| company=YTC/year=2007                     |
+-------------------------------------------+--+
94 rows selected (0.093 seconds)
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def>

[donghua@cdh-vm ~]$ hdfs dfs -ls -R /user/hive/warehouse/monthly_taxi_fleet_company_year_part
drwxrwxrwt   - donghua hive          0 2017-12-25 22:16 /user/hive/warehouse/monthly_taxi_fleet_company_year_part/company=CityCab
drwxrwxrwt   - donghua hive          0 2017-12-25 22:16 /user/hive/warehouse/monthly_taxi_fleet_company_year_part/company=CityCab/year=2005
-rwxrwxrwt   1 donghua hive        450 2017-12-25 22:16 /user/hive/warehouse/monthly_taxi_fleet_company_year_part/company=CityCab/year=2005/000000_0
drwxrwxrwt   - donghua hive          0 2017-12-25

<omitted for reading clarity>
-rwxrwxrwt   1 donghua hive        450 2017-12-25 22:15 /user/hive/warehouse/monthly_taxi_fleet_company_year_part/company=TransCab/year=2016/000000_0
drwxrwxrwt   - donghua hive          0 2017-12-25 22:16 /user/hive/warehouse/monthly_taxi_fleet_company_year_part/company=YTC
drwxrwxrwt   - donghua hive          0 2017-12-25 22:16 /user/hive/warehouse/monthly_taxi_fleet_company_year_part/company=YTC/year=2005
-rwxrwxrwt   1 donghua hive        450 2017-12-25 22:16 /user/hive/warehouse/monthly_taxi_fleet_company_year_part/company=YTC/year=2005/000000_0
drwxrwxrwt   - donghua hive          0 2017-12-25 22:16 /user/hive/warehouse/monthly_taxi_fleet_company_year_part/company=YTC/year=2006
-rwxrwxrwt   1 donghua hive        450 2017-12-25 22:16 /user/hive/warehouse/monthly_taxi_fleet_company_year_part/company=YTC/year=2006/000000_0
drwxrwxrwt   - donghua hive          0 2017-12-25 22:16 /user/hive/warehouse/monthly_taxi_fleet_company_year_part/company=YTC/year=2007
-rwxrwxrwt   1 donghua hive        447 2017-12-25 22:16 /user/hive/warehouse/monthly_taxi_fleet_company_year_part/company=YTC/year=2007/000000_0

Possible Errors & Solutions:

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert into monthly_taxi_fleet_company_year_part
. . . . . . . . . . . . . . . . . . . . . . .> partition (company,year)
. . . . . . . . . . . . . . . . . . . . . . .> select month,fleet,company,substr(month,0,4) as year from monthly_taxi_fleet;
INFO  : Compiling command(queryId=hive_20171225214545_54b5b6f8-8a03-451a-929f-fb175747a0ec): insert into monthly_taxi_fleet_company_year_part
partition (company,year)
select month,fleet,company,substr(month,0,4) as year from monthly_taxi_fleet
INFO  : Semantic Analysis Completed
INFO  : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:_col0, type:char(7), comment:null), FieldSchema(name:_col1, type:int, comment:null), FieldSchema(name:_col2, type:varchar(50), comment:null), FieldSchema(name:_col3, type:string, comment:null)], properties:null)
INFO  : Completed compiling command(queryId=hive_20171225214545_54b5b6f8-8a03-451a-929f-fb175747a0ec); Time taken: 0.166 seconds
INFO  : Concurrency mode is disabled, not creating a lock manager
INFO  : Executing command(queryId=hive_20171225214545_54b5b6f8-8a03-451a-929f-fb175747a0ec): insert into monthly_taxi_fleet_company_year_part
partition (company,year)
select month,fleet,company,substr(month,0,4) as year from monthly_taxi_fleet
INFO  : Query ID = hive_20171225214545_54b5b6f8-8a03-451a-929f-fb175747a0ec
INFO  : Total jobs = 3
INFO  : Launching Job 1 out of 3
INFO  : Starting task [Stage-1:MAPRED] in serial mode
INFO  : Number of reduce tasks is set to 0 since there's no reduce operator
INFO  : number of splits:1
INFO  : Submitting tokens for job: job_1513984921012_0065
INFO  : The url to track the job:
http://cdh-vm.dbaglobe.com:8088/proxy/application_1513984921012_0065/
INFO  : Starting Job = job_1513984921012_0065, Tracking URL = http://cdh-vm.dbaglobe.com:8088/proxy/application_1513984921012_0065/
INFO  : Kill Command = /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/hadoop/bin/hadoop job  -kill job_1513984921012_0065
INFO  : Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
INFO  : 2017-12-25 21:45:12,194 Stage-1 map = 0%,  reduce = 0%
INFO  : 2017-12-25 21:46:12,857 Stage-1 map = 0%,  reduce = 0%
INFO  : 2017-12-25 21:46:19,235 Stage-1 map = 100%,  reduce = 0%
ERROR : Ended Job = job_1513984921012_0065 with errors
ERROR : FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask
INFO  : MapReduce Jobs Launched:
INFO  : Stage-Stage-1: Map: 1   HDFS Read: 0 HDFS Write: 0 FAIL
INFO  : Total MapReduce CPU Time Spent: 0 msec
INFO  : Completed executing command(queryId=hive_20171225214545_54b5b6f8-8a03-451a-929f-fb175747a0ec); Time taken: 76.105 seconds
Error: Error while processing statement: FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask (state=08S01,code=2)

2017-12-25 21:45:24,056 FATAL [IPC Server handler 17 on 44101] org.apache.hadoop.mapred.TaskAttemptListenerImpl: Task: attempt_1513984921012_0065_m_000000_0 - exited : java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing row {"month":"2006-02","company":"YTC","fleet":876}
    at org.apache.hadoop.hive.ql.exec.mr.ExecMapper.map(ExecMapper.java:179)
    at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:54)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:459)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:343)
    at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:422)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1917)
    at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing row {"month":"2006-02","company":"YTC","fleet":876}
    at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:507)
    at org.apache.hadoop.hive.ql.exec.mr.ExecMapper.map(ExecMapper.java:170)
    ... 8 more
Caused by: org.apache.hadoop.hive.ql.metadata.HiveFatalException: [Error 20004]: Fatal error occurred when node tried to create too many dynamic partitions. The maximum number of dynamic partitions is controlled by hive.exec.max.dynamic.partitions and hive.exec.max.dynamic.partitions.pernode. Maximum was set to: 100
    at org.apache.hadoop.hive.ql.exec.FileSinkOperator.getDynOutPaths(FileSinkOperator.java:897)
    at org.apache.hadoop.hive.ql.exec.FileSinkOperator.processOp(FileSinkOperator.java:677)
    at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:815)
    at org.apache.hadoop.hive.ql.exec.SelectOperator.processOp(SelectOperator.java:84)
    at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:815)
    at org.apache.hadoop.hive.ql.exec.TableScanOperator.processOp(TableScanOperator.java:98)
    at org.apache.hadoop.hive.ql.exec.MapOperator$MapOpCtx.forward(MapOperator.java:157)
    at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:497)
    ... 9 more

   
1 row selected (44.065 seconds)
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> set hive.exec.max.dynamic.partitions=1000;
No rows affected (0.006 seconds)
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> set hive.exec.max.dynamic.partitions.pernode=1000;
No rows affected (0.002 seconds)

Hive Bucketing with examples


Think of it as HASH-based indexes in an RDBMS; bucketing is more suitable for high-cardinality columns (e.g. customer_id, product_id, station_id, etc.).

Basic Bucket example:


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> create table monthly_taxi_fleet6
. . . . . . . . . . . . . . . . . . . . . . .> (month char(7),fleet smallint,company varchar(50))
. . . . . . . . . . . . . . . . . . . . . . .> clustered by (company) into 3 buckets
. . . . . . . . . . . . . . . . . . . . . . .> stored as avro;

Example using Apache Hive version 1.1.0-cdh5.13.1, where hive.enforce.bucketing=false by default:
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert into monthly_taxi_fleet6
. . . . . . . . . . . . . . . . . . . . . . .> select month,fleet,company from monthly_taxi_fleet;

[donghua@cdh-vm ~]$ hdfs dfs -ls -R /user/hive/warehouse/monthly_taxi_fleet6
-rwxrwxrwt   1 donghua hive      25483 2017-12-26 10:40 /user/hive/warehouse/monthly_taxi_fleet6/000000_0

-- hive.enforce.bucketing: Whether bucketing is enforced. If true, while inserting into the table, bucketing is enforced.
-- Default Value: Hive 0.x: false, Hive 1.x: false, Hive 2.x: removed, which effectively makes it always true (HIVE-12331)

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> set hive.enforce.bucketing=true;

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert into monthly_taxi_fleet6
. . . . . . . . . . . . . . . . . . . . . . .> select month,fleet,company from monthly_taxi_fleet;

[donghua@cdh-vm ~]$ hdfs dfs -ls -R /user/hive/warehouse/monthly_taxi_fleet6
-rwxrwxrwt   1 donghua hive      13611 2017-12-26 10:43 /user/hive/warehouse/monthly_taxi_fleet6/000000_0
-rwxrwxrwt   1 donghua hive       6077 2017-12-26 10:43 /user/hive/warehouse/monthly_taxi_fleet6/000001_0
-rwxrwxrwt   1 donghua hive       6589 2017-12-26 10:43 /user/hive/warehouse/monthly_taxi_fleet6/000002_0

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> describe extended monthly_taxi_fleet6;
+-----------------------------+----------------------------------------------------+----------+--+
|          col_name           |                     data_type                      | comment  |
+-----------------------------+----------------------------------------------------+----------+--+
| month                       | char(7)                                            |          |
| fleet                       | int                                                |          |
| company                     | varchar(50)                                        |          |
|                             | NULL                                               | NULL     |
| Detailed Table Information  | Table(tableName:monthly_taxi_fleet6, dbName:default, owner:donghua, createTime:1514256031, lastAccessTime:0, retention:0, sd:StorageDescriptor(cols:[FieldSchema(name:month, type:char(7), comment:null), FieldSchema(name:fleet, type:smallint, comment:null), FieldSchema(name:company, type:varchar(50), comment:null)], location:hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet6, inputFormat:org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat, outputFormat:org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat, compressed:false, numBuckets:3, serdeInfo:SerDeInfo(name:null, serializationLib:org.apache.hadoop.hive.serde2.avro.AvroSerDe, parameters:{serialization.format=1}), bucketCols:[company], sortCols:[], parameters:{}, skewedInfo:SkewedInfo(skewedColNames:[], skewedColValues:[], skewedColValueLocationMaps:{}), storedAsSubDirectories:false), partitionKeys:[], parameters:{totalSize=26277, numRows=1128, rawDataSize=0, COLUMN_STATS_ACCURATE=true, numFiles=3, transient_lastDdlTime=1514256192}, viewOriginalText:null, viewExpandedText:null, tableType:MANAGED_TABLE) |          |
+-----------------------------+----------------------------------------------------+----------+--+
5 rows selected (0.075 seconds)
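
To get a rough feel for how rows were spread across the three files above, the bucket assignment can be approximated with Hive's built-in hash() function; this is only a sketch of the hash-and-modulo idea (using pmod to keep the result non-negative), not necessarily the exact internal formula Hive applies:

-- approximate bucket id per company for a 3-bucket table
select company, pmod(hash(company), 3) as bucket_id
from monthly_taxi_fleet6
group by company;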

Advanced Bucket example: Partition + Bucketing + Sorted by


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> create table monthly_taxi_fleet7
. . . . . . . . . . . . . . . . . . . . . . .> (month char(7),fleet smallint)
. . . . . . . . . . . . . . . . . . . . . . .> partitioned by (company varchar(50))
. . . . . . . . . . . . . . . . . . . . . . .> clustered by (month) sorted by (month) into 3 buckets
. . . . . . . . . . . . . . . . . . . . . . .> stored as avro;

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert into monthly_taxi_fleet7
. . . . . . . . . . . . . . . . . . . . . . .> partition (company)
. . . . . . . . . . . . . . . . . . . . . . .> select month,fleet,company from monthly_taxi_fleet;

[donghua@cdh-vm ~]$ hdfs dfs -ls -R /user/hive/warehouse/monthly_taxi_fleet7
drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=CityCab
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=CityCab/000000_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=CityCab/000001_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=CityCab/000002_0

drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Comfort
-rwxrwxrwt   1 donghua hive        913 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Comfort/000000_0
-rwxrwxrwt   1 donghua hive        913 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Comfort/000001_0
-rwxrwxrwt   1 donghua hive        913 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Comfort/000002_0
drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Individual Yellow- Top
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Individual Yellow- Top/000000_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Individual Yellow- Top/000001_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Individual Yellow- Top/000002_0
drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Premier
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Premier/000000_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Premier/000001_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Premier/000002_0
drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Prime
-rwxrwxrwt   1 donghua hive        765 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Prime/000000_0
-rwxrwxrwt   1 donghua hive        765 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Prime/000001_0
-rwxrwxrwt   1 donghua hive        766 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Prime/000002_0
drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=SMRT
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=SMRT/000000_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=SMRT/000001_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=SMRT/000002_0
drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Smart
-rwxrwxrwt   1 donghua hive        720 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Smart/000000_0
-rwxrwxrwt   1 donghua hive        719 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Smart/000001_0
-rwxrwxrwt   1 donghua hive        719 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Smart/000002_0
drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=TransCab
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=TransCab/000000_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=TransCab/000001_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=TransCab/000002_0
drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=YTC
-rwxrwxrwt   1 donghua hive        432 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=YTC/000000_0
-rwxrwxrwt   1 donghua hive        432 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=YTC/000001_0
-rwxrwxrwt   1 donghua hive        432 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=YTC/000002_0

Hive table sampling explained with examples


-- Leverage prebuilt buckets
select * from monthly_taxi_fleet6 tablesample(bucket 1 out of 3 on month);

-- Leverage prebuilt buckets, splitting the 3 buckets into 10 buckets dynamically
select * from monthly_taxi_fleet6 tablesample(bucket 1 out of 10 on month);

-- Dynamically build bucket on company column
select * from monthly_taxi_fleet6 tablesample(bucket 1 out of 3 on company);

-- block based sampling
select * from monthly_taxi_fleet6 tablesample(5 percent);

-- block based sampling, limit input by storage size
select * from monthly_taxi_fleet6 tablesample(5M);

-- row-based sampling, limiting input by row count
select * from monthly_taxi_fleet6 tablesample(10 rows);
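
One practical use of sampling is cheap, approximate aggregates; for example, estimating the average fleet size from roughly one bucket's worth of rows (a sketch based on the table above):

-- approximate average fleet size from ~1/3 of the rows
select avg(fleet) from monthly_taxi_fleet6 tablesample(bucket 1 out of 3 on company);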


Alternative way to disable transparent hugepage (THP) on Red Hat Enterprise Linux 7


Append “transparent_hugepage=never” to GRUB_CMDLINE_LINUX in /etc/default/grub

[root@hdp-vm ~]# vi /etc/default/grub
GRUB_TIMEOUT=5
GRUB_DISTRIBUTOR="$(sed 's, release .*$,,g' /etc/system-release)"
GRUB_DEFAULT=saved
GRUB_DISABLE_SUBMENU=true
GRUB_TERMINAL_OUTPUT="console"
GRUB_CMDLINE_LINUX="rd.lvm.lv=centos/root rd.lvm.lv=centos/swap rhgb quiet transparent_hugepage=never"
GRUB_DISABLE_RECOVERY="true"

[root@hdp-vm ~]# grub2-mkconfig -o /boot/grub2/grub.cfg
Generating grub configuration file ...
Found linux image: /boot/vmlinuz-3.10.0-693.11.1.el7.x86_64
Found initrd image: /boot/initramfs-3.10.0-693.11.1.el7.x86_64.img
Found linux image: /boot/vmlinuz-3.10.0-693.el7.x86_64
Found initrd image: /boot/initramfs-3.10.0-693.el7.x86_64.img
Found linux image: /boot/vmlinuz-0-rescue-8be2b63ef43643f786bd865127a5a3bb
Found initrd image: /boot/initramfs-0-rescue-8be2b63ef43643f786bd865127a5a3bb.img

[root@hdp-vm ~]# reboot

[root@hdp-vm ~]# cat /sys/kernel/mm/transparent_hugepage/enabled
always madvise [never]


[root@hdp-vm ~]# cat /proc/cmdline
BOOT_IMAGE=/vmlinuz-3.10.0-693.11.1.el7.x86_64 root=/dev/mapper/centos-root ro rd.lvm.lv=centos/root rd.lvm.lv=centos/swap rhgb quiet transparent_hugepage=never

Hive Transform using python script example


Python Script: 

[donghua@cdh-vm ~]$ cat employees.py
import sys

for line in sys.stdin:
    (employeeid, firstname, lastname) = line.split('\t')
    # print() in Python 2 would append an extra newline; lastname already
    # carries the trailing newline read from stdin, so write it as-is
    sys.stdout.write(employeeid + '\t' + firstname + ',' + lastname)
  
Hive Script: 

create table employees (employee_id int,first_name string,last_name string) stored as avro;
insert into employees values(1,'donghua','luo'),(2,'larry','elison'),(3,'tom','kyte');

add file /tmp/employees.py;
select transform(employee_id,first_name,last_name) using 'python employees.py' as (employee_id,full_name) from employees;

Sample output:

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/d> select transform(employee_id,first_name,last_name) using 'python employees.py' as (employee_id,full_name) from employees;
  +--------------+---------------+--+
  | employee_id  |   full_name   |
  +--------------+---------------+--+
  | 1            | donghua,luo   |
  | 2            | larry,elison  |
  | 3            | tom,kyte      |
  +--------------+---------------+--+
  3 rows selected (17.997 seconds)
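
The transform output can also be materialised into a table instead of only being selected; a sketch (the target table name and column types here are assumptions, not part of the original example):

create table employees_fullname (employee_id int, full_name string) stored as avro;

add file /tmp/employees.py;
insert overwrite table employees_fullname
select transform(employee_id,first_name,last_name)
using 'python employees.py'
as (employee_id int, full_name string)
from employees;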

Permission issue after Sentry enabled for HDFS/HIVE/Impala/Hue


A few tables were originally created under user donghua; now zero tables show up through the “show tables” command:

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/d> show tables;
+-----------+--+
| tab_name  |
+-----------+--+
+-----------+--+
No rows selected (0.386 seconds)

Permission denied for create table:

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/d> create table employee3 (id int, name string);
Error: Error while compiling statement: FAILED: SemanticException No valid privileges
  User donghua does not have privileges for CREATETABLE
  The required privileges: Server=server1->Db=default->action=*; (state=42000,code=40000)


Quick solution:

Log in to Hive as user hive and grant the required permissions to donghua (refer to the URL here if help is needed to log in as hive after Kerberos is enabled: http://www.dbaglobe.com/2017/12/login-as-keberos-userprincipal-after.html)

create role analyst_role;
grant all on database default to role analyst_role;
grant role analyst_role to donghua;
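
Back in the donghua session, the new grants can be verified with Sentry's role-inspection statements (a quick check, assuming the role name created above):

show current roles;
show grant role analyst_role;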

Kudu & Impalad flag file configuration


[root@cdh-vm donghua]# ps -ef|egrep 'kudu-|impalad'
kudu      4466  1221  0 Dec29 ?        00:01:10 /opt/cloudera/parcels/CDH-5.13.0-1.cdh5.13.0.p0.29/lib/kudu/sbin/kudu-tserver --tserver_master_addrs=cdh-vm.dbaglobe.com --flagfile=/run/cloudera-scm-agent/process/96-kudu-KUDU_TSERVER/gflagfile
kudu      4468  1221  0 Dec29 ?        00:01:15 /opt/cloudera/parcels/CDH-5.13.0-1.cdh5.13.0.p0.29/lib/kudu/sbin/kudu-master --flagfile=/run/cloudera-scm-agent/process/97-kudu-KUDU_MASTER/gflagfile
impala   11401  1221  0 Dec29 ?        00:02:12 /opt/cloudera/parcels/CDH-5.13.0-1.cdh5.13.0.p0.29/lib/impala/sbin-retail/impalad --flagfile=/run/cloudera-scm-agent/process/110-impala-IMPALAD/impala-conf/impalad_flags

[root@cdh-vm donghua]# cat /run/cloudera-scm-agent/process/97-kudu-KUDU_MASTER/gflagfile
-default_num_replicas=3
-fs_data_dirs=/dfs/kmd
-fs_wal_dir=/dfs/kmw
-log_dir=/var/log/kudu
-log_force_fsync_all=false
-logbuflevel=0
-max_log_size=1800
-minloglevel=0
-superuser_acl
-user_acl=*
-v=0
-webserver_certificate_file
-webserver_port=8051
-webserver_private_key_file
-webserver_private_key_password_cmd

[root@cdh-vm donghua]# cat /run/cloudera-scm-agent/process/96-kudu-KUDU_TSERVER/gflagfile
-block_cache_capacity_mb=512
-fs_data_dirs=/dfs/ktd
-fs_wal_dir=/dfs/ktw
-log_dir=/var/log/kudu
-log_force_fsync_all=false
-logbuflevel=0
-maintenance_manager_num_threads=1
-max_log_size=1800
-memory_limit_hard_bytes=1073741824
-minloglevel=0
-superuser_acl
-user_acl=*
-v=0
-webserver_certificate_file
-webserver_port=8050
-webserver_private_key_file
-webserver_private_key_password_cmd


[root@cdh-vm donghua]# cat /run/cloudera-scm-agent/process/110-impala-IMPALAD/impala-conf/impalad_flags
-beeswax_port=21000
-fe_port=21000
-be_port=22000
-llama_callback_port=28000
-hs2_port=21050
-enable_webserver=true
-mem_limit=268435456
-max_log_files=10
-webserver_port=25000
-max_result_cache_size=100000
-state_store_subscriber_port=23000
-statestore_subscriber_timeout_seconds=30
-scratch_dirs=/impala/impalad
-default_query_options
-load_auth_to_local_rules=false
-kerberos_reinit_interval=60
-principal=impala/cdh-vm.dbaglobe.com@DBAGLOBE.COM
-keytab_file=/run/cloudera-scm-agent/process/110-impala-IMPALAD/impala.keytab
-log_filename=impalad
-audit_event_log_dir=/var/log/impalad/audit
-max_audit_event_log_file_size=5000
-abort_on_failed_audit_event=false
-minidump_path=/var/log/impala-minidumps
-max_minidumps=9
-lineage_event_log_dir=/var/log/impalad/lineage
-max_lineage_log_file_size=5000
-hostname=cdh-vm.dbaglobe.com
-state_store_host=cdh-vm.dbaglobe.com
-enable_rm=false
-state_store_port=24000
-catalog_service_host=cdh-vm.dbaglobe.com
-catalog_service_port=26000
-local_library_dir=/var/lib/impala/udfs
-fair_scheduler_allocation_path=/run/cloudera-scm-agent/process/110-impala-IMPALAD/impala-conf/fair-scheduler.xml
-llama_site_path=/run/cloudera-scm-agent/process/110-impala-IMPALAD/impala-conf/llama-site.xml
-disable_admission_control=false
-queue_wait_timeout_ms=60000
-disk_spill_encryption=false
-abort_on_config_error=true
-kudu_master_hosts=cdh-vm.dbaglobe.com

Apache Kudu DML example (kudu 1.5.0-cdh5.13.1)


[donghua@cdh-vm ~]$ impala-shell -i cdh-vm.dbaglobe.com -k
Starting Impala Shell using Kerberos authentication
Using service name 'impala'
Connected to cdh-vm.dbaglobe.com:21000
Server version: impalad version 2.10.0-cdh5.13.1 RELEASE (build 1e4b23c4eb52dac95c5be6316f49685c41783c51)
***********************************************************************************
Welcome to the Impala shell.
(Impala Shell v2.10.0-cdh5.13.1 (1e4b23c) built on Thu Nov  9 08:29:47 PST 2017)

To see a summary of a query's progress that updates in real-time, run 'set
LIVE_PROGRESS=1;'.
**********************************************************************************

[cdh-vm.dbaglobe.com:21000] > create table employees(id int, name string) stored as kudu;
Query: create table employees(id int, name string) stored as kudu
ERROR: AnalysisException: A primary key is required for a Kudu table.

[cdh-vm.dbaglobe.com:21000] > create table employees(id int, name string, primary key (id)) stored as kudu;
Query: create table employees(id int, name string, primary key (id)) stored as kudu
WARNINGS: Unpartitioned Kudu tables are inefficient for large data sizes.

Fetched 0 row(s) in 0.41s
[cdh-vm.dbaglobe.com:21000] > drop table employees;
Query: drop table employees

[cdh-vm.dbaglobe.com:21000] > create table employees(id int, name string, primary key (id)) partition by hash partitions 3 stored as kudu;
Query: create table employees(id int, name string, primary key (id)) partition by hash partitions 3 stored as kudu
Fetched 0 row(s) in 0.15s

[cdh-vm.dbaglobe.com:21000] > insert into employees values (1,'donghua');
Query: insert into employees values (1,'donghua')
Query submitted at: 2017-12-30 07:22:56 (Coordinator:
http://cdh-vm.dbaglobe.com:25000)
Query progress can be monitored at: http://cdh-vm.dbaglobe.com:25000/query_plan?query_id=724c4c67c59d5eb6:9c74075700000000
Modified 1 row(s), 0 row error(s) in 4.28s

[cdh-vm.dbaglobe.com:21000] > select * from employees;
Query: select * from employees
Query submitted at: 2017-12-30 07:23:12 (Coordinator:
http://cdh-vm.dbaglobe.com:25000)
Query progress can be monitored at: http://cdh-vm.dbaglobe.com:25000/query_plan?query_id=bb462f142f62e12b:385e2ce900000000
+----+---------+
| id | name    |
+----+---------+
| 1  | donghua |
+----+---------+
Fetched 1 row(s) in 0.16s

[cdh-vm.dbaglobe.com:21000] > insert into employees values (2,'larry');
Query: insert into employees values (2,'larry')
Query submitted at: 2017-12-30 07:23:21 (Coordinator:
http://cdh-vm.dbaglobe.com:25000)
Query progress can be monitored at: http://cdh-vm.dbaglobe.com:25000/query_plan?query_id=1a4767310c5a9b99:2c2a26b400000000
Modified 1 row(s), 0 row error(s) in 0.11s

[cdh-vm.dbaglobe.com:21000] > select * from employees;
Query: select * from employees
Query submitted at: 2017-12-30 07:23:26 (Coordinator:
http://cdh-vm.dbaglobe.com:25000)
Query progress can be monitored at: http://cdh-vm.dbaglobe.com:25000/query_plan?query_id=7d4b866c73311bd9:5374e5ad00000000
+----+---------+
| id | name    |
+----+---------+
| 2  | larry   |
| 1  | donghua |
+----+---------+
Fetched 2 row(s) in 0.16s

[cdh-vm.dbaglobe.com:21000] > update employees set id=3 where id=1;
Query: update employees set id=3 where id=1
Query submitted at: 2017-12-30 07:23:44 (Coordinator:
http://cdh-vm.dbaglobe.com:25000)
ERROR: AnalysisException: Key column 'id' cannot be updated.

[cdh-vm.dbaglobe.com:21000] > update employees set name='tom' where id=2;
Query: update employees set name='tom' where id=2
Query submitted at: 2017-12-30 07:23:58 (Coordinator:
http://cdh-vm.dbaglobe.com:25000)
Query progress can be monitored at: http://cdh-vm.dbaglobe.com:25000/query_plan?query_id=644fe7f97c2c5221:bc7730eb00000000
Modified 1 row(s), 0 row error(s) in 0.18s

[cdh-vm.dbaglobe.com:21000] > delete from employees where id=1;
Query: delete from employees where id=1
Query submitted at: 2017-12-30 07:24:11 (Coordinator:
http://cdh-vm.dbaglobe.com:25000)
Query progress can be monitored at: http://cdh-vm.dbaglobe.com:25000/query_plan?query_id=3048cecbb9e1c886:7686422c00000000
Modified 1 row(s), 0 row error(s) in 0.13s

[cdh-vm.dbaglobe.com:21000] > select * from employees;
Query: select * from employees
Query submitted at: 2017-12-30 07:24:16 (Coordinator:
http://cdh-vm.dbaglobe.com:25000)
Query progress can be monitored at: http://cdh-vm.dbaglobe.com:25000/query_plan?query_id=7244597f3717fcd0:5c81509d00000000
+----+------+
| id | name |
+----+------+
| 2  | tom  |
+----+------+
Fetched 1 row(s) in 0.14s
[cdh-vm.dbaglobe.com:21000] > exit;
Goodbye donghua
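
Kudu tables in Impala also support UPSERT, which inserts a new row or, when the primary key already exists, updates it in place; a sketch against the same employees table (not part of the original session):

upsert into employees values (2,'larry'), (3,'kyte');
-- id=2 already exists so its name is overwritten; id=3 is inserted as a new row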

Kudu “distribute by” syntax error in Impala


The example below follows the syntax shown in various Cloudera Kudu training materials (Kudu v1.0), documentation and blogs, but fails with a syntax error (simply replacing “distribute by” with the equivalent “partition by” syntax doesn't help either).

[donghua@cdh-vm ~]$ impala-shell -i cdh-vm.dbaglobe.com -k
Starting Impala Shell using Kerberos authentication
Using service name 'impala'
Connected to cdh-vm.dbaglobe.com:21000
Server version: impalad version 2.10.0-cdh5.13.1 RELEASE (build 1e4b23c4eb52dac95c5be6316f49685c41783c51)
***********************************************************************************
Welcome to the Impala shell.


(Impala Shell v2.10.0-cdh5.13.1 (1e4b23c) built on Thu Nov  9 08:29:47 PST 2017)


[cdh-vm.dbaglobe.com:21000] > create table kudu_iotdatademo3
                            > distribute by hash (eventts) into 3 buckets
                             > tblproperties (
                            > 'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler',
                             > 'kudu_tablename'='kudu_iotdatademo2',
                            > 'kudu_master_addresses'='cdh-vm.dbaglobe.com:8051',
                            > 'kudu_key_columns'='stationid,eventts')
                            > as select * from iotdatademo2;
Query: create table kudu_iotdatademo3
distribute by hash (eventts) into 3 buckets
tblproperties (
'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler',
'kudu_tablename'='kudu_iotdatademo2',
'kudu_master_addresses'='cdh-vm.dbaglobe.com:8051',
'kudu_key_columns'='stationid,eventts')
as select * from iotdatademo2
Query submitted at: 2017-12-30 07:17:43 (Coordinator:
http://cdh-vm.dbaglobe.com:25000)
ERROR: AnalysisException: Syntax error in line 2:
distribute by hash (eventts) into 3 buckets
^
Encountered: IDENTIFIER
Expected: ADD, ALTER, AS, CACHED, CHANGE, COMMENT, DROP, FROM, LIKE, LOCATION, PARTITION, PARTITIONED, PRIMARY, PURGE, RECOVER, RENAME, REPLACE, ROW, SELECT, SET, SORT, STORED, STRAIGHT_JOIN, TBLPROPERTIES, TO, UNCACHED, VALUES, WITH

CAUSED BY: Exception: Syntax error

Workaround: (Tested with Kudu 1.5 + Impala 2.10 in CDH 5.13.1)


[cdh-vm.dbaglobe.com:21000] > create table kudu_iotdatademo2
                            > (stationid int,
                             > eventts timestamp,
                            > eventdate int,
                            > eventday tinyint,
                            > speed float,
                             > volume int,
                            > primary key (stationid,eventts))
                            > partition by hash partitions 3
                            > stored as kudu;
Query: create table kudu_iotdatademo2
(stationid int,
eventts timestamp,
eventdate int,
eventday tinyint,
speed float,
volume int,
primary key (stationid,eventts))
partition by hash partitions 3
stored as kudu
Fetched 0 row(s) in 1.15s

[cdh-vm.dbaglobe.com:21000] > insert into kudu_iotdatademo2
                             > select stationid,eventts,eventdate,eventday,speed,volume from iotdatademo2;
Query: insert into kudu_iotdatademo2
select stationid,eventts,eventdate,eventday,speed,volume from iotdatademo2
Query submitted at: 2017-12-30 07:18:56 (Coordinator:
http://cdh-vm.dbaglobe.com:25000)
Query progress can be monitored at: http://cdh-vm.dbaglobe.com:25000/query_plan?query_id=a4acf75a7302750:8317ee4000000000
Modified 3456001 row(s), 0 row error(s) in 31.43s
[cdh-vm.dbaglobe.com:21000] >
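
For a single-statement alternative to the two-step workaround above, Impala on this release should also accept the PRIMARY KEY and PARTITION BY clauses directly in a CREATE TABLE ... AS SELECT for Kudu; a hedged sketch (the table name kudu_iotdatademo4 is made up for illustration):

create table kudu_iotdatademo4
primary key (stationid, eventts)
partition by hash (stationid, eventts) partitions 3
stored as kudu
as select stationid, eventts, eventdate, eventday, speed, volume from iotdatademo2;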

Common errors and fixes with Spark 1.6 running with Python3 (Anaconda Version)


1. Error caused by Python 3.6 (version too new)

[donghua@cdh-vm spark]$ pyspark
WARNING: User-defined SPARK_HOME (/opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/spark) overrides detected (/opt/cloudera/parcels/CDH/lib/spark).
WARNING: Running pyspark from user-defined location.
Python 3.6.3 |Anaconda, Inc.| (default, Oct 13 2017, 12:02:49)
Type 'copyright', 'credits' or 'license' for more information
IPython 6.1.0 -- An enhanced Interactive Python. Type '?' for help.
[TerminalIPythonApp] WARNING | Unknown error in handling PYTHONSTARTUP file /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/spark/python/pyspark/shell.py:


TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'

How to fix:

[donghua@cdh-vm spark]$ conda create -n py35 python=3.5 anaconda

[donghua@cdh-vm spark]$ source activate py35

2. Error caused by the worker using a different Python version from the pyspark driver

(py35) [donghua@cdh-vm ~]$ pyspark
WARNING: User-defined SPARK_HOME (/opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/spark) overrides detected (/opt/cloudera/parcels/CDH/lib/spark).
WARNING: Running pyspark from user-defined location.
Python 3.5.4 |Anaconda, Inc.| (default, Oct 13 2017, 11:22:58)
Type 'copyright', 'credits' or 'license' for more information
IPython 6.1.0 -- An enhanced Interactive Python. Type '?' for help.
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel).
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 1.6.0
      /_/

Using Python version 3.5.4 (default, Oct 13 2017 11:22:58)
SparkContext available as sc, HiveContext available as sqlContext.


In [9]: sc.textFile('/user/donghua/IOTDataDemo.csv').filter(lambda line: line[0:9] != "StationID").map(lambda line: (line.split(",")[3],(float(line.split(",")[4]),1))).reduceByK
   ...: ey(lambda a,b: (a[0]+b[0],a[1]+b[1])).mapValues(lambda v: v[0]/v[1]).sortByKey()
[Stage 0:>                                                          (0 + 2) / 2]

18/01/03 08:22:00 WARN scheduler.TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, cdh-vm.dbaglobe.com, executor 1): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/spark/python/pyspark/worker.py", line 64, in main
    ("%d.%d" % sys.version_info[:2], version))
Exception: Python in worker has different version 2.7 than that in driver 3.5, PySpark cannot run with different minor versions

How to fix:

Add line “PYSPARK_PYTHON=/opt/anaconda3/envs/py35/bin/python3” to file “/opt/cloudera/parcels/CDH/lib/spark/conf/spark-env.sh”

3. Error “Randomness of hash of string should be disabled via PYTHONHASHSEED”

In [1]: sc.textFile('/user/donghua/IOTDataDemo.csv').filter(lambda line: line[0:9] != "StationID").map(lambda line: (line.split(",")[3],(float(line.split(",")[4]),1))).reduceByK
   ...: ey(lambda a,b: (a[0]+b[0],a[1]+b[1])).mapValues(lambda v: v[0]/v[1]).sortByKey()
[Stage 0:>                                                          (0 + 2) / 2]18/01/03 09:17:09 WARN scheduler.TaskSetManager: Lost task 1.0 in stage 0.0 (TID 1, cdh-vm.dbaglobe.com, executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/spark/python/pyspark/worker.py", line 111, in main
    process()
  File "/opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/spark/python/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/spark/python/pyspark/serializers.py", line 133, in dump_stream
    for obj in iterator:
  File "/opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/spark/python/pyspark/rdd.py", line 1703, in add_shuffle_key
    buckets[partitionFunc(k) % numPartitions].append((k, v))
  File "/opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/spark/python/pyspark/rdd.py", line 74, in portable_hash
    raise Exception("Randomness of hash of string should be disabled via PYTHONHASHSEED")
Exception: Randomness of hash of string should be disabled via PYTHONHASHSEED

        at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
        at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
        at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
        at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:342)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
        at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
        at org.apache.spark.scheduler.Task.run(Task.scala:89)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:242)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

How to fix:

Add line “SPARK_YARN_USER_ENV=PYTHONHASHSEED=0” to file “/opt/cloudera/parcels/CDH/lib/spark/conf/spark-env.sh”

[root@cdh-vm conf]# diff /opt/cloudera/parcels/CDH/lib/spark/conf/spark-env.sh /opt/cloudera/parcels/CDH/lib/spark/conf/spark-env.sh.orig
63,66d62
<
< PYSPARK_PYTHON=/opt/anaconda3/envs/py35/bin/python3
< SPARK_YARN_USER_ENV=PYTHONHASHSEED=0
<
79d74
<

(py35) [donghua@cdh-vm ~]$ pyspark
WARNING: User-defined SPARK_HOME (/opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/spark) overrides detected (/opt/cloudera/parcels/CDH/lib/spark).
WARNING: Running pyspark from user-defined location.
Python 3.5.4 |Anaconda, Inc.| (default, Oct 13 2017, 11:22:58)
Type 'copyright', 'credits' or 'license' for more information
IPython 6.1.0 -- An enhanced Interactive Python. Type '?' for help.
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel).
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 1.6.0
      /_/

Using Python version 3.5.4 (default, Oct 13 2017 11:22:58)
SparkContext available as sc, HiveContext available as sqlContext.


In [3]: sc.textFile('/user/donghua/IOTDataDemo.csv').filter(lambda line: line[0:9] != "StationID").map(lambda line: (line.split(",")[3],(float(line.split(",")[4]),1))).reduceByKey(lambda a,b: (a[0]+b[0],a[1]+b[1])).mapValues(lambda v: v[0]/v[1]).sortByKey().collect()

Out[3]:
[('0', 80.42217204861151),
 ('1', 80.42420773058639),
 ('2', 80.516892013888),
 ('3', 80.42997673611161),
 ('4', 80.62740798611237),
 ('5', 80.49621712962933),
 ('6', 80.5453983217595)]


Using Open Source R-Studio Server connecting to Kerberos-enabled Hadoop


Step 1: Add line "SPARK_HOME=${SPARK_HOME-'/opt/cloudera/parcels/CDH/lib/spark/'}" to end of file "/usr/lib64/R/etc/Renviron"

Step 2: Connect to Spark using sparklyr inside R-Studio Server

> install.packages("sparklyr")
> library(sparklyr)
> readRenviron("/usr/lib64/R/etc/Renviron")
> sc <- spark_connect(master = "yarn-client",version = "1.6.0", config = list
(default = list(spark.yarn.keytab = "/home/donghua/donghua.keytab", spark.yarn.principal = "donghua@DBAGLOBE.COM")))
> sc
$master
[1] "yarn-client"

$method
[1] "shell"

$app_name
[1] "sparklyr"

$config
$config$default
$config$default$spark.yarn.keytab
[1] "/home/donghua/donghua.keytab"

$config$default$spark.yarn.principal
[1] "donghua@DBAGLOBE.COM"



$spark_home
[1] "/opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/spark"

$backend
A connection with
description "->localhost:46015"
class "sockconn"
mode "wb"
text "binary"
opened "opened"
can read "yes"
can write "yes"

$monitor
A connection with
description "->localhost:8880"
class "sockconn"
mode "rb"
text "binary"
opened "opened"
can read "yes"
can write "yes"

$output_file
[1] "/tmp/RtmpXWaXfE/file7af1ca61a03_spark.log"

$spark_context
<jobj[6]>
class org.apache.spark.SparkContext
org.apache.spark.SparkContext@355d7d99

$java_context
<jobj[7]>
class org.apache.spark.api.java.JavaSparkContext
org.apache.spark.api.java.JavaSparkContext@ef616c5

attr(,"class")
[1] "spark_connection" "spark_shell_connection" "DBIConnection"



> library(DBI)
> iotdatademo <- dbGetQuery(sc, 'Select * from default.iotdatademo limit 10')
> iotdatademo


Reference URL: https://medium.com/@bkvarda/sparklyr-r-interface-for-spark-and-kerberos-on-cloudera-80abf5f6b4ad

R: Work with SQL Server using RODBC


> library(RODBC)
> # Connect to SQL Server using integrated security
> connStr <- paste("Server=WIN2016\\SQL2017",
+                 "Driver=SQL Server",
+                  "Database=AdventureWorks2017",
+                  sep=";")
> conn <-odbcDriverConnect(connStr)


> tab <- sqlTables(conn)


> head(tab)
           TABLE_CAT    TABLE_SCHEM                TABLE_NAME TABLE_TYPE REMARKS
1 AdventureWorks2017            dbo            AWBuildVersion      TABLE    <NA>
2 AdventureWorks2017            dbo               DatabaseLog      TABLE    <NA>
3 AdventureWorks2017            dbo                  ErrorLog      TABLE    <NA>
4 AdventureWorks2017 HumanResources                Department      TABLE    <NA>
5 AdventureWorks2017 HumanResources                  Employee      TABLE    <NA>
6 AdventureWorks2017 HumanResources EmployeeDepartmentHistory      TABLE    <NA>

> emp <- sqlFetch(conn, "HumanResources.Employee")


> head(emp)
  BusinessEntityID NationalIDNumber                   LoginID OrganizationNode OrganizationLevel
1                1        295847284     adventure-works\\ken0                                 NA
2                2        245797967   adventure-works\\terri0               58                 1
3                3        509647174 adventure-works\\roberto0           5a, c0                 2
4                4        112457891     adventure-works\\rob0           5a, d6                 3
5                5        695256908    adventure-works\\gail0           5a, da                 3
6                6        998320692  adventure-works\\jossef0           5a, de                 3
                       JobTitle  BirthDate MaritalStatus Gender   HireDate SalariedFlag VacationHours SickLeaveHours
1       Chief Executive Officer 1969-01-29             S      M 2009-01-14            1            99             69
2 Vice President of Engineering 1971-08-01             S      F 2008-01-31            1             1             20
3           Engineering Manager 1974-11-12             M      M 2007-11-11            1             2             21
4          Senior Tool Designer 1974-12-23             S      M 2007-12-05            0            48             80
5               Design Engineer 1952-09-27             M      F 2008-01-06            1             5             22
6               Design Engineer 1959-03-11             M      M 2008-01-24            1             6             23
  CurrentFlag                              rowguid ModifiedDate
1           1 F01251E5-96A3-448D-981E-0F99D789110D   2014-06-30
2           1 45E8F437-670D-4409-93CB-F9424A40D6EE   2014-06-30
3           1 9BBBFB2C-EFBB-4217-9AB7-F97689328841   2014-06-30
4           1 59747955-87B8-443F-8ED4-F8AD3AFDF3A9   2014-06-30
5           1 EC84AE09-F9B8-4A15-B4A9-6CCBAB919B08   2014-06-30
6           1 E39056F1-9CD5-478D-8945-14ACA7FBDCDD   2014-06-30

> query <- "select top 10 LoginID, JobTitle from HumanResources.Employee where HireDate > '2010-01-01'"
> sqlQuery(conn,query)
                      LoginID                     JobTitle
1   adventure-works\\ovidiu0         Senior Tool Designer
2   adventure-works\\janice0                Tool Designer
3  adventure-works\\michael8       Senior Design Engineer
4   adventure-works\\sharon0              Design Engineer
5     adventure-works\\john5         Marketing Specialist
6     adventure-works\\mary2          Marketing Assistant
7   adventure-works\\wanida0          Marketing Assistant
8      adventure-works\\kim1 Production Technician - WC60
9       adventure-works\\ed0 Production Technician - WC60
10  adventure-works\\maciej0 Production Technician - WC60

> query <- "select top 10 * from HumanResources.Employee where HireDate > '2010-01-01'"
> df <- sqlQuery(conn,query)[c("LoginID", "JobTitle")]
> df
                     LoginID                     JobTitle
1   adventure-works\\ovidiu0         Senior Tool Designer
2   adventure-works\\janice0                Tool Designer
3  adventure-works\\michael8       Senior Design Engineer
4   adventure-works\\sharon0              Design Engineer
5     adventure-works\\john5         Marketing Specialist
6     adventure-works\\mary2          Marketing Assistant
7   adventure-works\\wanida0          Marketing Assistant
8      adventure-works\\kim1 Production Technician - WC60
9       adventure-works\\ed0 Production Technician - WC60
10  adventure-works\\maciej0 Production Technician - WC60

> dim(df)
[1] 10  2
> sapply(df,class)
  LoginID JobTitle
"factor" "factor"


> sqlColumns(conn, "HumanResources.Employee")[c("COLUMN_NAME","TYPE_NAME")]
         COLUMN_NAME        TYPE_NAME
1   BusinessEntityID              int
2   NationalIDNumber         nvarchar
3            LoginID         nvarchar
4   OrganizationNode      hierarchyid
5  OrganizationLevel         smallint
6           JobTitle         nvarchar
7          BirthDate             date
8      MaritalStatus            nchar
9             Gender            nchar
10          HireDate             date
11      SalariedFlag             Flag
12     VacationHours         smallint
13    SickLeaveHours         smallint
14       CurrentFlag             Flag
15           rowguid uniqueidentifier
16      ModifiedDate         datetime


> df <- sqlQuery(conn, "select ProductID, avg(UnitPrice),stdev(UnitPrice) from [Sales].[SalesOrderDetail] group by ProductID")
> colnames(df) <- c("ProductID", "Avg(UnitPrice)", "STDEV(UnitPrice)")
> colnames(df)
[1] "ProductID"       "Avg(UnitPrice)"  "STDEV(UnitPrice)"

> names(df)
[1] "ProductID"       "Avg(UnitPrice)"  "STDEV(UnitPrice)"

> head(df)
  ProductID Avg(UnitPrice) STDEV(UnitPrice)
1       925       149.8519    3.315829e-01
2       902       200.0520    0.000000e+00
3       710         5.7000    2.299513e-07
4       879       159.0000    0.000000e+00
5       733       356.8980    1.677983e-05
6       856        53.9073    8.234393e-01

> head(df[1:2],3)
  ProductID Avg(UnitPrice)
1       925       149.8519
2       902       200.0520
3       710         5.7000

> dim(df);ncol(df);nrow(df)
[1] 266   3
[1] 3
[1] 266

# str –> structure, not string

> str(df)
'data.frame':    266 obs. of  3 variables:
  $ ProductID      : int  925 902 710 879 733 856 756 779 802 971 ...
  $ Avg(UnitPrice) : num  149.9 200.1 5.7 159 356.9 ...
  $ STDEV(UnitPrice): num  3.32e-01 0.00 2.30e-07 0.00 1.68e-05 ...


> df[df$`Avg(UnitPrice)`>3000,c("ProductID","Avg(UnitPrice)")]
    ProductID Avg(UnitPrice)
23        750       3270.419
47        753       3035.880
130       751       3326.304
158       752       3290.494
188       749       3170.195


> df[df$`Avg(UnitPrice)`>3000,]
    ProductID Avg(UnitPrice) STDEV(UnitPrice)
23        750       3270.419        588.9196
47        753       3035.880        695.0954
130       751       3326.304        545.7862
158       752       3290.494        574.4163
188       749       3170.195        646.8741

> subset(df,ProductID>750 & `Avg(UnitPrice)`>3000,select=-`STDEV(UnitPrice)`)
    ProductID Avg(UnitPrice)
47        753       3035.880
130       751       3326.304
158       752       3290.494

> df2 <- sqlQuery(conn,"select ProductID,UnitPrice from Sales.SalesOrderDetail")
> summary(df2)
   ProductID       UnitPrice      
  Min.   :707.0   Min.   :   1.328 
  1st Qu.:768.0   1st Qu.:  21.490 
  Median :863.0   Median :  49.990 
  Mean   :841.7   Mean   : 465.093 
  3rd Qu.:921.0   3rd Qu.: 602.346 
  Max.   :999.0   Max.   :3578.270 

> summary(df2$UnitPrice)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
   1.328   21.490   49.990  465.100  602.300 3578.000
  
> df_emp <- sqlQuery(conn,"select JobTitle,BirthDate from HumanResources.Employee")  

> sqlSave(conn,df_emp,tablename="r_temp1",rownames=FALSE,fast=TRUE)

> sqlDrop(conn,"r_temp1")

> version
               _
platform       x86_64-w64-mingw32
arch           x86_64
os             mingw32
system         x86_64, mingw32
status
major          3
minor          3.3
year           2017
month          03
day            06
svn rev        72310
language       R
version.string R version 3.3.3 (2017-03-06)
nickname       Another Canoe

> library(help="RODBC")

        Information on package ‘RODBC’

Description:

Package:              RODBC
Version:              1.3-15
Revision:             $Rev: 3476 $
Date:                 2017-04-13
Authors@R:            c(person("Brian", "Ripley", role = c("aut", "cre"), email = "ripley@stats.ox.ac.uk"),
                      person("Michael", "Lapsley", role = "aut", comment = "1999 to Oct 2002"))
Title:                ODBC Database Access
Description:          An ODBC database interface.
SystemRequirements:   An ODBC3 driver manager and drivers.
Depends:              R (>= 3.0.0)
Imports:              stats
LazyLoad:             yes
Biarch:               yes
License:              GPL-2 | GPL-3
NeedsCompilation:     yes
Packaged:             2017-04-13 07:00:50 UTC; ripley
Author:               Brian Ripley [aut, cre], Michael Lapsley [aut] (1999 to Oct 2002)
Maintainer:           Brian Ripley <ripley@stats.ox.ac.uk>
Repository:           CRAN
Date/Publication:     2017-04-13 07:04:28 UTC
Built:                R 3.3.2; x86_64-w64-mingw32; 2017-04-28 16:33:43 UTC; windows

Index:

RODBC                   ODBC Database Connectivity
odbcClose               ODBC Close Connections
odbcConnect             ODBC Open Connections
odbcDataSources         List ODBC Data Sources
odbcGetInfo             Request Information on an ODBC Connection
odbcQuery               Low-level ODBC functions
odbcSetAutoCommit       ODBC Set Auto-Commit Mode
setSqlTypeInfo          Specify or Query a Mapping of R Types to DBMS
                        Types
sqlColumns              Query Column Structure in ODBC Tables
sqlCopy                 ODBC Copy
sqlDrop                 Deletion Operations on Tables in ODBC databases
sqlFetch                Reading Tables from ODBC Databases
sqlQuery                Query an ODBC Database
sqlSave                 Write a Data Frame to a Table in an ODBC
                        Database
sqlTables               List Tables on an ODBC Connection
sqlTypeInfo             Request Information about Data Types in an ODBC
                         Database

Further information is available in the following vignettes in directory ‘C:/Program Files/Microsoft SQL
Server/140/R_SERVER/library/RODBC/doc’:

RODBC: ODBC Connectivity (source, pdf)

Sample configuration using nginx LB to access Cloudera Manager

[root@localhost ~]# cat /etc/nginx/conf.d/cloudera.conf
server {
    listen 7180;
    location / {
        proxy_pass http://clouderacm;
        # May not need or want to set Host; it should default to the hostname above.
        proxy_set_header Host             $http_host;
        proxy_set_header X-Forwarded-For  $remote_addr;
    }
}

upstream clouderacm {
    hash $remote_addr consistent;

    server cdh-vm.dbaglobe.com:7180;
}
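
To sanity-check the change, the configuration can be validated and nginx reloaded before pointing browsers at the load balancer (a minimal sketch; it assumes nginx runs under systemd on this host):

# Validate the configuration syntax, then reload without dropping connections
[root@localhost ~]# nginx -t
[root@localhost ~]# systemctl reload nginx
# Confirm the proxy forwards to Cloudera Manager (an HTTP status code should come back from CM)
[root@localhost ~]# curl -s -o /dev/null -w "%{http_code}\n" http://localhost:7180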

HTTPFS & WebHDFS API Examples

# Create directory /user/donghua/newdir
# user.name=donghua is passed for authentication purposes
[donghua@cdh-vm ~]$ curl -i -X PUT "http://cdh-vm:14000/webhdfs/v1/user/donghua/newdir?op=MKDIRS&user.name=donghua"
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
Set-Cookie: hadoop.auth="u=donghua&p=donghua&t=simple-dt&e=1516447651395&s=wq4TMpf9TNtSCrFiqs1Tam66ZzQ="; Path=/; HttpOnly
Content-Type: application/json
Transfer-Encoding: chunked
Date: Sat, 20 Jan 2018 01:27:31 GMT

{"boolean":true}

# List directory & Files in /user/donghua
[donghua@cdh-vm ~]$ curl -i -X GET "http://cdh-vm:14000/webhdfs/v1/user/donghua/?op=LISTSTATUS&user.name=donghua"
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
Set-Cookie: hadoop.auth="u=donghua&p=donghua&t=simple-dt&e=1516447856962&s=aZyV3CQ997aG09J+mQQsC4FDRjU="; Path=/; HttpOnly
Content-Type: application/json
Transfer-Encoding: chunked
Date: Sat, 20 Jan 2018 01:30:57 GMT

{"FileStatuses":{"FileStatus":[{"pathSuffix":".Trash","type":"DIRECTORY","length":0,"owner":"donghua","group":"supergroup","permission":"700","accessTime":0,"modificationTime":1515927600117,"blockSize":0,"replication":0},{"pathSuffix":".sparkStaging","type":"DIRECTORY","length":0,"owner":"donghua","group":"supergroup","permission":"755","accessTime":0,"modificationTime":1514001258000,"blockSize":0,"replication":0},{"pathSuffix":".staging","type":"DIRECTORY","length":0,"owner":"donghua","group":"supergroup","permission":"700","accessTime":0,"modificationTime":1514379310197,"blockSize":0,"replication":0},{"pathSuffix":"mkdir","type":"DIRECTORY","length":0,"owner":"donghua","group":"supergroup","permission":"755","accessTime":0,"modificationTime":1516411617837,"blockSize":0,"replication":0},{"pathSuffix":"monthly_taxi_fleet","type":"DIRECTORY","length":0,"owner":"donghua","group":"supergroup","permission":"755","accessTime":0,"modificationTime":1514024183300,"blockSize":0,"replication":0},{"pathSuffix":"monthly_taxi_fleet.ddl","type":"FILE","length":729,"owner":"donghua","group":"supergroup","permission":"644","accessTime":1516411543045,"modificationTime":1516411543262,"blockSize":134217728,"replication":1},{"pathSuffix":"newdir","type":"DIRECTORY","length":0,"owner":"donghua","group":"supergroup","permission":"755","accessTime":0,"modificationTime":1516411651448,"blockSize":0,"replication":0},{"pathSuffix":"newdir2","type":"DIRECTORY","length":0,"owner":"donghua","group":"supergroup","permission":"755","accessTime":0,"modificationTime":1516411701096,"blockSize":0,"replication":0}]}}

# Get file contents
[donghua@cdh-vm ~]$ curl -i -X GET "http://cdh-vm:14000/webhdfs/v1/user/donghua/monthly_taxi_fleet.ddl?op=OPEN&user.name=donghua"
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
Set-Cookie: hadoop.auth="u=donghua&p=donghua&t=simple-dt&e=1516447887889&s=nq4WkUAsHUMl06yPMecNmUS/s38="; Path=/; HttpOnly
Content-Type: application/octet-stream
Content-Length: 729
Date: Sat, 20 Jan 2018 01:31:27 GMT

createtab_stmt
CREATE EXTERNAL TABLE `monthly_taxi_fleet`(
  `month` char(7), 
  `company` varchar(50), 
  `fleet` smallint)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
WITH SERDEPROPERTIES ( 
  'field.delim'=',', 
  'serialization.format'=',') 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.mapred.TextInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://cdh-vm.dbaglobe.com:8020/user/donghua/monthly_taxi_fleet'
TBLPROPERTIES (
  'COLUMN_STATS_ACCURATE'='false', 
  'numFiles'='1', 
  'numRows'='-1', 
  'rawDataSize'='-1', 
  'skip.header.line.count'='1', 
  'totalSize'='25802', 
  'transient_lastDdlTime'='1514024883')
  
# Get file Status
[donghua@cdh-vm ~]$ curl -i -X GET "http://cdh-vm:14000/webhdfs/v1/user/donghua/monthly_taxi_fleet.ddl?op=GETFILESTATUS&user.name=donghua"
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
Set-Cookie: hadoop.auth="u=donghua&p=donghua&t=simple-dt&e=1516448141481&s=VmoBQqQUjfd/DryQ4WctTLwoPcw="; Path=/; HttpOnly
Content-Type: application/json
Transfer-Encoding: chunked
Date: Sat, 20 Jan 2018 01:35:41 GMT

{"FileStatus":{"pathSuffix":"","type":"FILE","length":729,"owner":"donghua","group":"supergroup","permission":"644","accessTime":1516411543045,"modificationTime":1516411543262,"blockSize":134217728,"replication":1}}

# Delete file
[donghua@cdh-vm ~]$ curl -i -X DELETE "http://cdh-vm:14000/webhdfs/v1/user/donghua/monthly_taxi_fleet.ddl?op=DELETE&user.name=donghua"
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
Set-Cookie: hadoop.auth="u=donghua&p=donghua&t=simple-dt&e=1516448233112&s=aIVcU49l57oWd5QesPBchhaYiTM="; Path=/; HttpOnly
Content-Type: application/json
Transfer-Encoding: chunked
Date: Sat, 20 Jan 2018 01:37:13 GMT


[donghua@cdh-vm ~]$ curl -i -X GET "http://cdh-vm:14000/webhdfs/v1/user/donghua/monthly_taxi_fleet.ddl?op=GETFILESTATUS&user.name=donghua"
HTTP/1.1 404 Not Found
Server: Apache-Coyote/1.1
Set-Cookie: hadoop.auth="u=donghua&p=donghua&t=simple-dt&e=1516448282348&s=rHkMNLdVJe/gN6Wouy+lqon257Q="; Path=/; HttpOnly
Content-Type: application/json
Transfer-Encoding: chunked
Date: Sat, 20 Jan 2018 01:38:02 GMT

{"RemoteException":{"message":"File does not exist: \/user\/donghua\/monthly_taxi_fleet.ddl","exception":"FileNotFoundException","javaClassName":"java.io.FileNotFoundException"}}

# Delete directory recursively
[donghua@cdh-vm ~]$ curl -i -X DELETE "http://cdh-vm:14000/webhdfs/v1/user/donghua/newdir?op=DELETE&recursive=true&user.name=donghua"
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
Set-Cookie: hadoop.auth="u=donghua&p=donghua&t=simple-dt&e=1516448379021&s=OTgN6OcRp7R1DFYPTxHrTdyUhRk="; Path=/; HttpOnly
Content-Type: application/json
Transfer-Encoding: chunked
Date: Sat, 20 Jan 2018 01:39:39 GMT
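
Renaming (moving) a file or directory follows the same pattern. A sketch, not from the original session; newdir2 appears in the listing earlier and the destination name here is a placeholder:

# Rename/move a directory; the response body is {"boolean":true} on success
[donghua@cdh-vm ~]$ curl -i -X PUT "http://cdh-vm:14000/webhdfs/v1/user/donghua/newdir2?op=RENAME&destination=/user/donghua/newdir3&user.name=donghua"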


WebHDFS uses port 50070 (the NameNode web UI port) and is enabled by default:


[donghua@cdh-vm ~]$ curl -i -X GET "http://cdh-vm:50070/webhdfs/v1/user/donghua/?op=LISTSTATUS&user.name=donghua"
HTTP/1.1 200 OK
Cache-Control: no-cache
Expires: Sat, 20 Jan 2018 01:46:47 GMT
Date: Sat, 20 Jan 2018 01:46:47 GMT
Pragma: no-cache
Expires: Sat, 20 Jan 2018 01:46:47 GMT
Date: Sat, 20 Jan 2018 01:46:47 GMT
Pragma: no-cache
Content-Type: application/json
X-FRAME-OPTIONS: SAMEORIGIN
Set-Cookie: hadoop.auth="u=donghua&p=donghua&t=simple&e=1516448807623&s=/yAvIL3eIfNFMIBeKCjJkD2JJk0="; Path=/; HttpOnly
Transfer-Encoding: chunked

{"FileStatuses":{"FileStatus":[
{"accessTime":0,"blockSize":0,"childrenNum":0,"fileId":22335,"group":"supergroup","length":0,"modificationTime":1515927600117,"owner":"donghua","pathSuffix":".Trash","permission":"700","replication":0,"storagePolicy":0,"type":"DIRECTORY"},
{"accessTime":0,"blockSize":0,"childrenNum":0,"fileId":19734,"group":"supergroup","length":0,"modificationTime":1514001258000,"owner":"donghua","pathSuffix":".sparkStaging","permission":"755","replication":0,"storagePolicy":0,"type":"DIRECTORY"},
{"accessTime":0,"blockSize":0,"childrenNum":1,"fileId":19354,"group":"supergroup","length":0,"modificationTime":1514379310197,"owner":"donghua","pathSuffix":".staging","permission":"700","replication":0,"storagePolicy":0,"type":"DIRECTORY"},
{"accessTime":0,"blockSize":0,"childrenNum":1,"fileId":33209,"group":"supergroup","length":0,"modificationTime":1514024183300,"owner":"donghua","pathSuffix":"monthly_taxi_fleet","permission":"755","replication":0,"storagePolicy":0,"type":"DIRECTORY"}

How to fix "Service Monitor" and "Host Monitor" failure during Cloudera CDH5 cluster restart

Symptoms:

Request to the Service Monitor failed. This may cause slow page responses. View the status of the Service Monitor.
Request to the Host Monitor failed. This may cause slow page responses. View the status of the Host Monitor.

How to Fix:

Cloudera Management Service -> Configuration

Descriptor Fetch Tries Interval -> increase from the default of 2 to 5
(The interval, in seconds, between fetch attempts for the SCM descriptor while Cloudera Management Service roles are starting; the log below shows the 5-second sleeps.)

Descriptor Fetch Max Tries -> increase from the default of 5 to 60
(The maximum number of attempts to fetch the SCM descriptor while Cloudera Management Service roles are starting; if the roles cannot obtain the descriptor within that many attempts, they exit.)
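
Separately from the configuration change, a quick way to watch for the moment Cloudera Manager starts answering on port 7180 during a restart (a convenience check, not part of the original fix):

# Poll CM every 5 seconds until it responds
[root@cdh-vm ~]# until curl -s -o /dev/null http://cdh-vm.dbaglobe.com:7180; do sleep 5; done; echo "CM is answering"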



Reference: the Service Monitor log below shows the role still waiting for the descriptor after 26 tries (roughly two minutes at 5-second intervals), well beyond the default limit of 5 tries, which explains the monitor failures during the cluster restart.

[root@cdh-vm cloudera-scm-firehose]# grep "5 sec" mgmt-cmf-mgmt-SERVICEMONITOR-cdh-vm.dbaglobe.com.log.out
2018-01-20 09:54:10,687 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 1 tries, sleeping for 5 secs
2018-01-20 09:54:15,722 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 2 tries, sleeping for 5 secs
2018-01-20 09:54:20,733 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 3 tries, sleeping for 5 secs
2018-01-20 09:54:25,743 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 4 tries, sleeping for 5 secs
2018-01-20 09:54:30,744 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 5 tries, sleeping for 5 secs
2018-01-20 09:54:35,745 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 6 tries, sleeping for 5 secs
2018-01-20 09:54:40,753 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 7 tries, sleeping for 5 secs
2018-01-20 09:54:45,761 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 8 tries, sleeping for 5 secs
2018-01-20 09:54:50,764 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 9 tries, sleeping for 5 secs
2018-01-20 09:54:55,767 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 10 tries, sleeping for 5 secs
2018-01-20 09:55:00,773 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 11 tries, sleeping for 5 secs
2018-01-20 09:55:05,789 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 12 tries, sleeping for 5 secs
2018-01-20 09:55:10,792 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 13 tries, sleeping for 5 secs
2018-01-20 09:55:15,796 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 14 tries, sleeping for 5 secs
2018-01-20 09:55:20,799 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 15 tries, sleeping for 5 secs
2018-01-20 09:55:25,802 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 16 tries, sleeping for 5 secs
2018-01-20 09:55:30,805 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 17 tries, sleeping for 5 secs
2018-01-20 09:55:35,813 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 18 tries, sleeping for 5 secs
2018-01-20 09:55:40,818 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 19 tries, sleeping for 5 secs
2018-01-20 09:55:45,824 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 20 tries, sleeping for 5 secs
2018-01-20 09:55:50,825 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 21 tries, sleeping for 5 secs
2018-01-20 09:55:55,828 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 22 tries, sleeping for 5 secs
2018-01-20 09:56:00,836 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 23 tries, sleeping for 5 secs
2018-01-20 09:56:05,837 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 24 tries, sleeping for 5 secs
2018-01-20 09:56:10,846 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 25 tries, sleeping for 5 secs
2018-01-20 09:56:15,852 WARN com.cloudera.cmon.firehose.Main: No descriptor fetched from http://cdh-vm.dbaglobe.com:7180 on after 26 tries, sleeping for 5 secs
[root@cdh-vm cloudera-scm-firehose]# grep "5 sec" mgmt-cmf-mgmt-SERVICEMONITOR-cdh-vm.dbaglobe.com.log.out | wc -l
26



