ansible playbook

[root@ ansible]# more sendmail.yml

- hosts: hadoop_sit
  user: root
  tasks:
    - name: 1. Install sendmail
      yum: name=sendmail state=latest

    - name: 2. Start sendmail
      service: name=sendmail state=started enabled=yes

Simple copy-file playbook

Command to run ansible playbook
ansible-playbook demo.yml
where 'demo.yml' is the playbook name

Dry Run mode
1
ansible-playbook demo.yml --check
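
It can also help to validate a playbook before a real run without contacting any hosts (standard ansible-playbook flag):

ansible-playbook demo.yml --syntax-check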

[ovi@~]$ more tsm.yml

- hosts: endur_dev
  tasks:
    - name: Copy the file
      copy: src=/tmp/file.txt dest=/tmp/file.txt

[ovi@ ~]$ ansible-playbook tsm.yml --ask-pass
SSH password:

PLAY [endur_dev] **************************************************************

GATHERING FACTS ***************************************************************
ok: [ndora01.uat.my.com]
ok: [ndora01.dev.my.com]

TASK: [Copy the file] *********************************************************
changed: [ndora01.dev.my.com]
changed: [ndora01.uat.my.com]

PLAY RECAP ********************************************************************
ndora01.dev.my.com : ok=2 changed=1 unreachable=0 failed=0
ndora01.uat.my.com : ok=2 changed=1 unreachable=0 failed=0

another simple playbook

[ovi@ ~]$ more tsm2.yml

- hosts: endur_dev
  tasks:
    - name: Copy the file
      copy: src=/tmp/file1.txt dest=/tmp/file1.txt

    - name: Copy second file
      copy: src=/tmp/file2.txt dest=/tmp/file2.txt

ansible

Ansible's features:

  • Accessed mostly through SSH (it also has paramiko and local modes)
  • Based on an agentless architecture
  • Has more than 200 built-in modules
  • No custom infrastructure required
  • Configuration (modules, playbooks) written in the easy-to-use YAML format
  • Ansible interacts with its clients either through playbooks or a command-line tool (ad-hoc commands)

Paramiko: a high-quality Python implementation of the SSHv2 protocol

Ansible components 

  • Inventory
  • Playbooks
    • Play
    • Tasks
    • Roles
    • Handlers
    • Templates
    • Variables
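
For reference, the inventory is a plain text file (by default /etc/ansible/hosts) that defines the host groups used throughout these examples. A minimal sketch reconstructed from the groups and hosts that appear in the output below (entries are illustrative):

[ovi@ ~]$ more /etc/ansible/hosts
[hadoop_dev]
192.168.68.116
192.168.68.117
192.168.68.118
192.168.68.119

[endur_dev]
ndora01.dev.my.com
ndora01.uat.my.com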

 

Example Ad-Hoc commands

To transfer a file directly to many servers:

$ ansible hadoop -m copy -a "src=/etc/hosts dest=/tmp/hosts"

To ping the servers

[ovi@ ~]$ ansible last_bpm -m ping --ask-pass
SSH password:
192.168.18.207 | success >> {
    "changed": false,
    "ping": "pong"
}

192.168.18.208 | success >> {
    "changed": false,
    "ping": "pong"
}

192.168.18.206 | success >> {
    "changed": false,
    "ping": "pong"
}

Run ansible ad-hoc command to check OS

[root@ ansible]# ansible hadoop_dev -m command -a "uname -a" --ask-pass
SSH password:
192.168.68.119 | success | rc=0 >>
Linux dphdmst04 2.6.32-504.23.4.el6.x86_64 #1 SMP Fri May 29 10:16:43 EDT 2015 x86_64 x86_64 x86_64 GNU/Linux

192.168.68.118 | success | rc=0 >>
Linux dphdmst03 2.6.32-504.23.4.el6.x86_64 #1 SMP Fri May 29 10:16:43 EDT 2015 x86_64 x86_64 x86_64 GNU/Linux

192.168.68.117 | success | rc=0 >>
Linux dphdmst02 2.6.32-504.23.4.el6.x86_64 #1 SMP Fri May 29 10:16:43 EDT 2015 x86_64 x86_64 x86_64 GNU/Linux

192.168.68.116 | success | rc=0 >>
Linux dphdmst01 2.6.32-504.23.4.el6.x86_64 #1 SMP Fri May 29 10:16:43 EDT 2015 x86_64 x86_64 x86_64 GNU/Linux

Restart ntpd service on one server

# ansible 192.168.129.61 -m service -a "name=ntpd state=restarted" -k
SSH password:
192.168.129.61 | success >> {
    "changed": true,
    "name": "ntpd",
    "state": "started"
}

[root@]# ansible hadoop -m shell -a "ps -e -o pcpu,pid,user,args|sort -k1 -nr|head -1" -k
SSH password:
192.236.1.52 | success | rc=0 >>
99.9 29068 root     python /usr/bin/goferd

192.236.1.56 | success | rc=0 >>
12.9   7321 root     /opt/microsoft/configmgr/bin/ccmexec.binsort: write failed:

192.236.1.53 | success | rc=0 >>
99.9 543544 root     python /usr/bin/goferd

192.236.1.54 | success | rc=0 >>
99.2 567756 root     python /usr/bin/goferd

192.236.1.55 | success | rc=0 >>
8.0 65506 root     sshd: root@notty

192.236.1.57 | success | rc=0 >>
15.6   7260 root     /opt/microsoft/configmgr/bin/ccmexec.binsort: write failed: standard output: Broken pipe

192.236.1.58 | success | rc=0 >>
100 491426 root     python /usr/bin/goferdsort: fflush failed: standard output: Broken

192.236.1.59 | success | rc=0 >>
100 485478 root     python /usr/bin/goferd

192.236.1.61 | success | rc=0 >>
100 463591 root     python /usr/bin/goferdsort: fflush failed: standard output: Broken pipe

192.236.1.60 | success | rc=0 >>
20.8   7263 root     /opt/microsoft/configmgr/bin/ccmexec.binsort: fflush failed:

manage services

# ansible hadoop_prod -m service -a "name=goferd state=restarted" -k
SSH password:

192.236.1.56 | success >> {
“changed”: true,
“name”: “goferd”,
“state”: “started”
}

192.236.1.52 | success >> {
“changed”: true,
“name”: “goferd”,
“state”: “started”
}

192.236.1.53 | success >> {
“changed”: true,
“name”: “goferd”,
“state”: “started”
}

192.236.1.54 | success >> {
“changed”: true,
“name”: “goferd”,
“state”: “started”
}

192.236.1.57 | success >> {
“changed”: true,
“name”: “goferd”,
“state”: “started”
}

192.236.1.58 | success >> {
“changed”: true,
“name”: “goferd”,
“state”: “started”
}

192.236.1.60 | success >> {
“changed”: true,
“name”: “goferd”,
“state”: “started”
}

192.236.1.61 | success >> {
“changed”: true,
“name”: “goferd”,
“state”: “started”
}

192.236.1.59 | success >> {
“changed”: true,
“name”: “goferd”,
“state”: “started”
}

gathering facts

[ovi ~]$ ansible last_bpm -a "free -m" --ask-pass
SSH password:
192.168.18.207 | success | rc=0 >>
total       used       free     shared    buffers     cached
Mem:         15951       3823      12128          0        237       1199
-/+ buffers/cache:       2385      13565
Swap:        16383          0      16383

192.168.18.208 | success | rc=0 >>
total       used       free     shared    buffers     cached
Mem:         15951       2540      13410          0        214        261
-/+ buffers/cache:       2063      13887
Swap:        16383          0      16383

192.168.18.206 | success | rc=0 >>
total       used       free     shared    buffers     cached
Mem:         15951       1245      14706          0        168        216
-/+ buffers/cache:        860      15090
Swap:        16383          0      16383

[ovi ~]$ ansible hadoop_prod -m file -a "dest=/tmp/testansible mode=644 owner=ovi group=130 state=directory" --ask-pass

192.236.1.60 | success >> {
“changed”: true,
“gid”: 130,
“group”: “sysadmin”,
“mode”: “0644”,
“owner”: “ovi”,
“path”: “/tmp/testansible”,
“size”: 4096,
“state”: “directory”,
“uid”: 275
}

192.236.1.61 | success >> {
“changed”: true,
“gid”: 130,
“group”: “sysadmin”,
“mode”: “0644”,
“owner”: “ovi”,
“path”: “/tmp/testansible”,
“size”: 4096,
“state”: “directory”,
“uid”: 275
}

$ ansible all -m setup
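
The setup module is what gathers host facts; its output is large, so it is often narrowed down with its filter parameter (the pattern below is illustrative):

$ ansible all -m setup -a "filter=ansible_*_mb"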

[ovi@C ~]$ ansible endur_dev -a "sudo yum list openssh" --ask-pass
SSH password:
dora01.uat.my.com | success | rc=0 >>
Loaded plugins: package_upload, product-id, security, subscription-manager
Available Packages
openssh.x86_64 5.3p1-112.el6_7 rhel-6-server-rpms

dora02.dev.my.com | success | rc=0 >>
Loaded plugins: package_upload, product-id, security, subscription-manager
Available Packages
openssh.x86_64 5.3p1-112.el6_7 rhel-6-server-rpms

copy a file to server

[ovi@ ~]$ ansible endur_dev -m copy -a "src=/etc/hosts dest=/tmp/hosts" --ask-pass
SSH password:
dora01.dev.my.com | success >> {
“changed”: true,
“dest”: “/tmp/hosts”,
“gid”: 130,
“group”: “sysadmin”,
“md5sum”: “bf17964d25f8802d53ee22d97edb8d4e”,
“mode”: “0644”,
“owner”: “oasimin”,
“size”: 132,
“src”: “/tmp/ansible-1456345034.31-198250330572361/source”,
“state”: “file”,
“uid”: 275
}

dora02.uat.my.com | success >> {
“changed”: true,
“dest”: “/tmp/hosts”,
“gid”: 130,
“group”: “sysadmin”,
“md5sum”: “bf17964d25f8802d53ee22d97edb8d4e”,
“mode”: “0644”,
“owner”: “oasimin”,
“size”: 132,
“src”: “/tmp/ansible-1456345034.3-35469159385891/source”,
“state”: “file”,
“uid”: 275
}

 

Command to run ansible playbook

ansible-playbook ovi.yml

where 'ovi.yml' is the playbook name

Dry Run mode

ansible-playbook ovi.yml --check

sqoop

Sqoop is a tool designed to transfer data between Hadoop and relational databases. You can use Sqoop to import data from a relational database management system (RDBMS) such as MySQL or Oracle into the Hadoop Distributed File System (HDFS), transform the data in Hadoop MapReduce, and then export the data back into an RDBMS.

$ sqoop list-databases --connect jdbc:postgresql://phdmst:5432 --username gpadmin --password xxxxxx
15/06/05 14:19:29 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
15/06/05 14:19:29 INFO manager.SqlManager: Using default fetchSize of 1000
template0
template1
postgres
gpadmin
ovi
RCDB

-bash-4.1$ sqoop list-databases --connect jdbc:postgresql://uphdmst02.uat.mydev.com:5432 --username gpadmin --password xxxxxx
15/08/04 14:20:05 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
15/08/04 14:20:05 INFO manager.SqlManager: Using default fetchSize of 1000
template0
template1
postgres
gpadmin
CCDM
testcmri
benchmark
testovi
ITS
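
A basic import of a single table from one of the databases listed above into HDFS would look roughly like this (the table name and target directory are illustrative, not from this environment); -P prompts for the password interactively, which is what the warning above recommends:

$ sqoop import \
 --connect jdbc:postgresql://uphdmst02.uat.mydev.com:5432/gpadmin \
 --username gpadmin -P \
 --table mytable \
 --target-dir /user/gpadmin/mytable \
 -m 1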

Netezza Connector

Netezza connector for Sqoop is an implementation of the Sqoop connector interfaces for accessing a Netezza data warehouse appliance, so that data can be exported and imported to a Hadoop environment from Netezza data warehousing environments.

The HDP 2 Sqoop distribution includes Netezza connector software. To deploy it, the only requirement is that you acquire the JDBC jar file (named nzjdbc.jar) from IBM and copy it to the /usr/local/nz/lib directory.

Here is an example of a complete command line for import using the Netezza external table feature:

$ sqoop import \
 --direct \
 --connect jdbc:netezza://nzhost:5480/sqoop \
 --table nztable \
 --username nzuser \
 --password nzpass \
 --target-dir hdfsdir \
 -- --log-dir /tmp

Here is an example of a complete command line for export with tab (\t) as the field terminator character:

$ sqoop export \
 --direct \
 --connect jdbc:netezza://nzhost:5480/sqoop \
 --table nztable \
 --username nzuser \
 --password nzpass \
 --export-dir hdfsdir \
 --input-fields-terminated-by "\t"

HAWQ

HAWQ is the new benchmark for SQL on Hadoop.

HAWQ is a parallel SQL query engine. It has been designed from the ground up as a massively parallel SQL processing engine optimized specifically for analytics, with full transaction support.

Pivotal HAWQ is a Massively Parallel Processing (MPP) database that uses several Postgres database instances and HDFS storage.

HAWQ Physical Architecture

  1. Hawq Master server
  2. HDFS NameNode
  3. Segment server
  4. Interconnect Switch

How to locate the logs:

HAWQ master logs.

[gpadmin@uphdmst02 gpseg-1]$ psql
psql (8.4.20, server 8.2.15)
WARNING: psql version 8.4, server version 8.2.
Some psql features might not work.
Type "help" for help.

gpadmin=# show data_directory;
    data_directory
----------------------
 /data/master/gpseg-1
(1 row)
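
Given the data directory reported above, the master log files live under its pg_log subdirectory (this path is inferred from the standard Greenplum/HAWQ layout rather than taken from this cluster):

[gpadmin@uphdmst02 ~]$ ls -lt /data/master/gpseg-1/pg_log | head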

[gpadmin@uphdmst02 ~]$ psql
psql (8.4.20, server 8.2.15)
WARNING: psql version 8.4, server version 8.2.
Some psql features might not work.
Type "help" for help.

gpadmin=# select version();
 version
--------------------------------------------------------------------------------------------------------------
 PostgreSQL 8.2.15 (Greenplum Database 4.2.0 build 1) (HAWQ 1.2.1.0 build 10335) on x86_64-unknown-linux-gnu,
 compiled by GCC gcc (GCC) 4.4.2 compiled on Aug  8 2014 16:31:48
(1 row)

Create an external table in UAT

gpadmin=# CREATE EXTERNAL TABLE person (id int, name text)
gpadmin-# LOCATION ('gpfdist://phdmst03.uat.mydev.com:8000/Test/person.txt') FORMAT 'text' (delimiter '|')
gpadmin-# ENCODING 'UTF8';
CREATE EXTERNAL TABLE

gpadmin=# select count(*) from person;
  count
---------
 1000000
(1 row)

gpadmin=#

-- External Table: ext_sim_result_value_f

-- DROP EXTERNAL TABLE ext_sim_result_value_f;

CREATE EXTERNAL TABLE ext_sim_result_value_f
(
  sim_result_id,
  md_point_id,
  path_num,
  value
)
LOCATION (
  'gpfdist://uphdmst03.uat.mydev.com:8000/500041_Sim_Result_Value_F.csv.gz'
)
FORMAT 'text' (delimiter '|' null '\\N' escape '\\')
ENCODING 'UTF8';
ALTER TABLE ext_sim_result_value_f
  OWNER TO gpadmin;
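
The gpfdist:// locations above assume a gpfdist file server is already running on port 8000 and serving the source files; a sketch of starting one (the directory and log file paths are illustrative):

[gpadmin@uphdmst03 ~]$ gpfdist -d /data/external -p 8000 -l /tmp/gpfdist.log &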

shutdown

 

20150731:16:56:16:570576 gpstop:uphdmst02:gpadmin-[INFO]:-Commencing parallel segment instance shutdown, please wait...
.................
20150731:16:56:32:570576 gpstop:uphdmst02:gpadmin-[INFO]:-----------------------------------------------------
20150731:16:56:32:570576 gpstop:uphdmst02:gpadmin-[INFO]:-   Segments stopped successfully      = 24
20150731:16:56:32:570576 gpstop:uphdmst02:gpadmin-[INFO]:-   Segments with errors during stop   = 0
20150731:16:56:32:570576 gpstop:uphdmst02:gpadmin-[INFO]:-----------------------------------------------------
20150731:16:56:32:570576 gpstop:uphdmst02:gpadmin-[INFO]:-Successfully shutdown 24 of 24 segment instances
20150731:16:56:32:570576 gpstop:uphdmst02:gpadmin-[INFO]:-Database successfully shutdown with no errors reported
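
The shutdown above is performed with gpstop; the exact invocation was not recorded here, but a non-interactive full stop would be along the lines of:

[gpadmin@uphdmst02 ~]$ gpstop -a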

HAWQ – pg_hba.conf

vim /data/master/gpseg-1/pg_hba.conf

source /usr/local/hawq/greenplum_path.sh

export MASTER_DATA_DIRECTORY=/data/master/gpseg-1

reload the configuration

[gpadmin@uphdmst02 ~]$ gpstop -u
20150805:11:05:40:233745 gpstop:uphdmst02:gpadmin-[INFO]:-Starting gpstop with args: -u
20150805:11:05:40:233745 gpstop:uphdmst02:gpadmin-[INFO]:-Gathering information and validating the environment…
20150805:11:05:40:233745 gpstop:uphdmst02:gpadmin-[INFO]:-Obtaining Greenplum Master catalog information
20150805:11:05:40:233745 gpstop:uphdmst02:gpadmin-[INFO]:-Obtaining Segment details from master…
20150805:11:05:41:233745 gpstop:uphdmst02:gpadmin-[INFO]:-Greenplum Version: ‘postgres (HAWQ) 4.2.0 build 1’
20150805:11:05:41:233745 gpstop:uphdmst02:gpadmin-[INFO]:-Signalling all postmaster processes to reload

pg_hba.conf

host    all     gpadmin 192.168.68.135/32        trust
host    all     gpadmin 192.168.68.135/32        trust
host    all     all     192.193.68.132/32        trust
host     all         gpadmin         10.115.yyy.0/24    trust
host     all         gpadmin         10.115.yyy.0/24    trust
host     all         aafsh02          10.115.zzz.0/24   trust
#host     all         all              10.115.xx.xx/32   trust
local    all         gpadmin         ident
host     all         gpadmin         127.0.0.1/28    trust
host     all         gpadmin         192.193.68.134/32       trust
host     all         gpadmin         ::1/128       trust
host     all         gpadmin         fe80::3aea:a7ff:fe35:c0c/128       trust
host     all         gpadmin         192.168.68.136/32       trust
host     all         gpadmin         10.110.xxx.0/24    trust
#host     all         gpadmin         10.115.xxx.0/24    trust
#host     all         gpadmin         10.115.xxx.0/24    trust
host     all         all             10.115.xxx.0/24    ldap ldapserver=xxx.225.227.15 ldapprefix="OFFICE\"
host     all         all             10.110.xxx.0/24    ldap ldapserver=xxx.225.227.15 ldapprefix="OFFICE\"
host     all         all             10.202.xxx.0/24    ldap ldapserver=xxx.225.227.15 ldapprefix="OFFICE\"
host     benchmark   fsh02           10.115.xxx.20/32   ldap ldapserver=xxx.225.227.yyy ldapprefix="OFFICE\"
host     benchmark   fsh02           10.193.xxx.yyy/32  ldap ldapserver=xxx.225.227.yyy ldapprefix="OFFICE\"

Create user in hawq

gpadmin=# CREATE USER ong01 WITH LOGIN ;

gpadmin=# CREATE USER user1 WITH LOGIN ;

gpadmin=# \du
                        List of roles
 Role name |            Attributes              |  Member of
-----------+------------------------------------+-------------
 CM_Admin  | Cannot login                       |
 RD_Admin  | Cannot login                       |
 RD_Users  | Cannot login                       |
 ovi02     | Superuser, Create DB               |
 ong01     |                                    |
 gpadmin   | Superuser, Create role, Create DB  |
 user2     |                                    |
 ovi       |                                    |
 rd_user   |                                    | {RD_Users}
 user1     |                                    |

 

gpadmin-# \l
                    List of databases
   Name    |  Owner   | Encoding |   Access privileges
-----------+----------+----------+------------------------
 APMS      | gpadmin  | UTF8     |
 B_POC     | gpadmin  | UTF8     | =Tc/gpadmin
                                 : gpadmin=CTc/gpadmin
                                 : gorodn=CTc/gpadmin
                                 : asi=CTc/gpadmin
 CCD       | gpadmin  | UTF8     |
 CI_DB     | usertest | UTF8     | usertest=CTc/usertest
                                 : ovi=CTc/usertest
 Mike_Test | gpadmin  | UTF8     | =Tc/gpadmin
                                 : gpadmin=CTc/gpadmin
                                 : ovi=CTc/gpadmin

 

GRANT ALL PRIVILEGES
  ON TABLE sim_result_d2, sim_result_value_f
  TO PUBLIC;

So for now they can access the tables I created.

 

Configure the Capacity Scheduler

The CapacityScheduler is designed to run Hadoop applications as a shared, multi-tenant cluster in an operator-friendly manner while maximizing the throughput and the utilization of the cluster.

The CapacityScheduler is designed to allow sharing a large cluster while giving each organization a minimum capacity guarantee.

To configure the ResourceManager to use the CapacityScheduler, set the following property in the conf/yarn-site.xml:

<property>
<name>yarn.resourcemanager.scheduler.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value>
</property>

Each child queue is tied to its parent queue with the yarn.scheduler.capacity.<queue-path>.queues configuration property in the capacity-scheduler.xml file

<property>
  <name>yarn.scheduler.capacity.root.queues</name>
  <value>default</value>
  <description>
    The queues at this level (root is the root queue).
  </description>
</property>
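
As a sketch of how additional child queues would be declared and sized in capacity-scheduler.xml (the queue names and capacity values below are illustrative, not from this cluster; the capacities of the queues at one level must add up to 100):

<property>
  <name>yarn.scheduler.capacity.root.queues</name>
  <value>default,analytics</value>
</property>
<property>
  <name>yarn.scheduler.capacity.root.default.capacity</name>
  <value>70</value>
</property>
<property>
  <name>yarn.scheduler.capacity.root.analytics.capacity</name>
  <value>30</value>
</property>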

The Capacity Scheduler reads this file when it starts. When you modify the capacity-scheduler.xml file, you have to reload the settings by running the following command:

yarn rmadmin -refreshQueues

After the command completes successfully, you can verify that the queues are set up using the command below:

-bash-4.1$ hadoop queue -list
DEPRECATED: Use of this script to execute mapred command is deprecated.
Instead use the mapred command for it.

15/08/14 16:31:20 INFO client.RMProxy: Connecting to ResourceManager at sphdmst03.dev.bmocm.com/192.168.68.131:8032
======================
Queue Name : default
Queue State : running
Scheduling Info : Capacity: 100.0, MaximumCapacity: 100.0, CurrentCapacity: 0.0

Use the command below to identify the queue names to which you can submit your jobs.

-bash-4.1$ hadoop queue -showacls
DEPRECATED: Use of this script to execute mapred command is deprecated.
Instead use the mapred command for it.

15/08/14 16:35:11 INFO client.RMProxy: Connecting to ResourceManager at sphdmst03.dev.bmocm.com/192.168.68.131:8032
Queue acls for user :  gpadmin

Queue  Operations
=====================
root  ADMINISTER_QUEUE,SUBMIT_APPLICATIONS
default  ADMINISTER_QUEUE,SUBMIT_APPLICATIONS

Hadoop Certifications

Hadoop system administrator certifications:

1. Cloudera: http://www.cloudera.com/content/cloudera/en/training/certification/ccah/prep.html

2. Hortonworks

** Installation

Configure a local HDP repository
Install ambari-server and ambari-agent
Install HDP using the Ambari install wizard
Add a new node to an existing cluster
Decommission a node
Add an HDP service to a cluster using Ambari

Configuration

Troubleshooting 

Restart an HDP service

View an application’s log file

Configure and manage alerts

Troubleshoot a failed job

High Availability

Configure NameNode HA

Configure ResourceManager HA

Copy data between two clusters using distcp

Create a snapshot of an HDFS directory

Recover a snapshot

Configure HiveServer2 HA

Security

Install and configure Knox

Install and configure Ranger

Configure HDFS ACLs

Configure Hadoop for Kerberos

 

 

Reference

 Hortonworks http://hortonworks.com/training/class/hdp-certified-administrator-hdpca-exam/

 

HDFS Snapshots

Make your HDFS directory snapshottable, in our case /test4

[gpadmin@sphdmst01 tmp]$ hdfs dfsadmin -allowSnapshot /test4
Allowing snaphot on /test4 succeeded

[gpadmin@sphdmst01 tmp]$ hdfs dfsadmin -disallowSnapshot /test4
Disallowing snaphot on /test4 succeeded

Create a snapshot

[gpadmin@sphdmst01 ]$ hdfs dfs -createSnapshot /test4 first-snapshot
Created snapshot /test4/.snapshot/first-snapshot

[gpadmin@sphdmst01 tmp]$ hdfs dfs -ls -R /test4/.snapshot
drwxr-xr-x   - gpadmin hadoop          0 2015-08-11 15:28 /test4/.snapshot/first-snapshot
-rw-r--r--   2 gpadmin hadoop      14515 2015-01-12 10:05 /test4/.snapshot/first-snapshot/Hadoop Servers.xlsx
-rw-r--r--   2 gpadmin hadoop          0 2015-01-12 10:04 /test4/.snapshot/first-snapshot/Hadoop_prod.xlsx
-rw-r--r--   2 gpadmin hadoop       4322 2015-01-12 10:08 /test4/.snapshot/first-snapshot/check_hadoop-dfs.sh
-rw-r--r--   2 gpadmin hadoop          0 2015-01-12 10:04 /test4/.snapshot/first-snapshot/pgadmin.log

You can read the content of the file

[gpadmin@sphdmst01 tmp]$ hdfs dfs -cat /test4/.snapshot/first-snapshot/check_hadoop-dfs.sh

Recover the file from the snapshot

[gpadmin@sphdmst01 /]$ hdfs dfs -cp /test4/.snapshot/first-snapshot/check_hadoop-dfs.sh /ovitest

[gpadmin@cmtolsphdmst01 /]$ hdfs dfs -ls /ovitest
Found 6 items
-rw-r--r--   2 gpadmin hadoop       4322 2015-08-11 15:37 /ovitest/check_hadoop-dfs.sh
-rw-r--r--   2 gpadmin hadoop         66 2015-01-13 16:31 /ovitest/test.txt
-rw-r--r--   2 gpadmin hadoop         66 2015-01-13 17:09 /ovitest/test2.txt
-rw-r--r--   2 gpadmin hadoop         66 2015-01-13 17:10 /ovitest/test3.txt
-rw-r--r--   2 gpadmin hadoop         66 2015-01-14 10:52 /ovitest/test4.txt
-rw-r--r--   2 gpadmin hadoop         66 2015-01-14 10:53 /ovitest/test5.txt

Another example :

[gpadmin@sphdmst01 ~]$ hdfs dfs -mkdir /test_snapshot

[gpadmin@sphdmst01 ~]$ hdfs dfs -put dfs-old-lsr-1.log /test_snapshot
[gpadmin@sphdmst01 ~]$ hdfs dfs -put dfs-old-fsck-1.log /test_snapshot

[gpadmin@sphdmst01 ~]$ hdfs dfs -ls /test_snapshot
Found 2 items
-rw-r--r--   2 gpadmin hadoop      45341 2015-09-14 09:39 /test_snapshot/dfs-old-fsck-1.log
-rw-r--r--   2 gpadmin hadoop      83862 2015-09-14 09:38 /test_snapshot/dfs-old-lsr-1.log

[gpadmin@sphdmst01 ~]$ hdfs dfs -createSnapshot /test_snapshot snapshot_dir
createSnapshot: Directory is not a snapshottable directory: /test_snapshot

[gpadmin@sphdmst01 ~]$ hdfs dfsadmin  -allowSnapshot /test_snapshot
Allowing snaphot on /test_snapshot succeeded

[gpadmin@sphdmst01 ~]$ hdfs dfs -createSnapshot /test_snapshot snapshot_dir
Created snapshot /test_snapshot/.snapshot/snapshot_dir

Snapshots are read-only; HDFS protects the snapshotted data against user or application deletion.
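
Two related housekeeping commands (standard HDFS snapshot commands, listed here as a reminder): hdfs lsSnapshottableDir shows which directories currently allow snapshots, and a snapshot that is no longer needed can be removed as follows:

[gpadmin@sphdmst01 ~]$ hdfs lsSnapshottableDir
[gpadmin@sphdmst01 ~]$ hdfs dfs -deleteSnapshot /test_snapshot snapshot_dir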

Creating a Hadoop archive – the small files problem

HDFS is designed to store and process large data sets (terabytes). Storing a large number of small files in HDFS is inefficient.

Hadoop Archives (HAR) can be used to address the namespace limitations associated with storing many small files. With HAR we can pack a number of small files into larger files while the original files remain transparently accessible.

You can use following command to create a Hadoop archive:

hadoop archive -archiveName <name>.har -p <parent_dir> <src>* <dest_dir>

Example :

[gpadmin@]$ hadoop archive -archiveName ovi.har -p /user/ovidiu  /user/ovi

[gpadmin@sphdmst02 ~]$ hadoop fs -ls /user/ovi
Found 1 items
drwxr-xr-x   - gpadmin hadoop          0 2015-07-23 16:20 /user/ovi/ovi.har

 

The following example creates an archive using /user/ovidiu as the relative archive directory.

The directories

/user/ovidiu/SIT1

/user/ovidiu/SIT2

/user/ovidiu/SIT3

will be archived in the /user/ovi/ovi2.har archive

$ hadoop archive -archiveName ovi2.har -p /user/ovidiu/ SIT1 SIT2 SIT3 /user/ovi

[gpadmin@cmtolsphdmst02 ~]$ hadoop fs -ls /user/ovi
Found 2 items
drwxr-xr-x   - gpadmin hadoop          0 2015-07-23 16:20 /user/ovi/ovi.har
drwxr-xr-x   - gpadmin hadoop          0 2015-07-24 11:59 /user/ovi/ovi2.har

Looking up files in Hadoop archives

To a client using the HAR filesystem nothing has changed: the original files are accessible and visible (albeit through a har:// URL).

[gpadmin@sphdmst02 ~]$ hdfs dfs -ls har:///user/ovi/ovi.har/
Found 3 items
-rw-r--r--   2 gpadmin hadoop        125 2015-07-23 16:19 har:///user/ovi/ovi.har/ranking.txt
-rw-r--r--   2 gpadmin hadoop         66 2015-01-14 16:06 har:///user/ovi/ovi.har/test.txt
-rw-r--r--   2 gpadmin hadoop         13 2015-07-23 16:18 har:///user/ovi/ovi.har/test2.txt

 

[gpadmin@sphdmst02 ~]$  hdfs dfs -ls har:///user/ovi/ovi2.har/
Found 3 items
drwxr-xr-x   - gpadmin hadoop          0 2015-07-24 11:54 har:///user/ovi/ovi2.har/SIT1
drwxr-xr-x   - gpadmin hadoop          0 2015-07-24 11:55 har:///user/ovi/ovi2.har/SIT2
drwxr-xr-x   - gpadmin hadoop          0 2015-07-24 11:55 har:///user/ovi/ovi2.har/SIT3
[gpadmin@cmtolsphdmst02 ~]$  hdfs dfs -ls har:///user/ovi/ovi2.har/SIT1
Found 2 items
-rw-r--r--   2 gpadmin hadoop        125 2015-07-24 11:52 har:///user/ovi/ovi2.har/SIT1/ranking.txt
-rw-r--r--   2 gpadmin hadoop         13 2015-07-24 11:54 har:///user/ovi/ovi2.har/SIT1/test2.txt
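
To pull files back out of an archive into regular HDFS storage, the archive can simply be used as a source path for a copy; the target directory below is illustrative:

[gpadmin@sphdmst02 ~]$ hadoop distcp har:///user/ovi/ovi2.har/SIT1 /user/ovidiu/SIT1_restored

For small amounts of data a plain hdfs dfs -cp from the har:// path works as well.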

 

Rack awareness

Hadoop divides the data into multiple file blocks and stores them on different machines. If rack awareness is not configured, Hadoop may place all copies of a block in the same rack, which results in data loss if that rack fails.

1) Below are the steps to configure the rack awareness policy manually:

** stop the cluster

** Copy the two files rack_topology.sh (the rack topology script) and topology.data to the /etc/gphd/hadoop/conf directory on all cluster NameNodes (phdmst01 and phdmst02)

** Add the following property to core-site.xml:

<property>
  <name>net.topology.script.file.name</name>
  <value>/etc/gphd/hadoop/conf/rack_topology.sh</value>
</property>

 

[root@phdmst01 conf]# pwd

/etc/gphd/hadoop/conf

Rack topology script

[root@phdmst01 conf]# more rack_topology.sh
HADOOP_CONF=/etc/gphd/hadoop/conf

while [ $# -gt 0 ] ; do
  nodeArg=$1
  exec< ${HADOOP_CONF}/topology.data
  result=""
  while read line ; do
    ar=( $line )
    if [ "${ar[0]}" = "$nodeArg" ] ; then
      result="${ar[1]}"
    fi
  done
  shift
  if [ -z "$result" ] ; then
    echo -n "/default/rack "
  else
    echo -n "$result "
  fi
done

[root@phdmst01 conf]# more topology.data
192.168.129.56           /bcc/rack1
192.168.129.57           /bcc/rack1
192.168.129.58           /bcc/rack1
192.168.129.59           /bcc/rack2
192.168.129.60           /bcc/rack2
192.168.129.61           /bcc/rack2
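
The script can be sanity-checked by hand before restarting anything; with the data file above it should map a DataNode IP to its rack (the second line is what the script is expected to print):

[root@phdmst01 conf]# bash /etc/gphd/hadoop/conf/rack_topology.sh 192.168.129.56
/bcc/rack1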

Verify Rack Awareness

The hdfs dfsadmin -printTopology command will show the topology:

-bash-4.1$ hdfs dfsadmin -printTopology
Rack: /bcc/rack1
   192.168.129.56:50010 (phddna01.mydev.com)
   192.168.129.57:50010 (phddna02.mydev.com)
   192.168.129.58:50010 (phddna03.mydev.com)

Rack: /bcc/rack2
   192.168.129.59:50010 (phddnb01.mydev.com)
   192.168.129.60:50010 (phddnb02.mydev.com)
   192.168.129.61:50010 (phddnb03.mydev.com)

You can also check with the following commands:

- the hadoop fsck command
- hdfs dfsadmin -report

2) Configure rack awareness with Ambari

Setting up the HDFS NFS Gateway

The NFS Gateway supports NFSv3 and allows HDFS to be mounted as part of the client’s local file system

Set up NFS Gateway to access HDFS data

Install hdfs nfs packages

 

#yum install hadoop-hdfs-nfs3.x86_64

#yum install hadoop-hdfs-portmap

To start the portmap and NFS gateway daemon:

Run either:

$ sudo service hadoop-hdfs-portmap start

$ sudo service hadoop-hdfs-nfs3 start

 

or

 

$ sudo /etc/init.d/hadoop-hdfs-portmap start

$ sudo /etc/init.d/hadoop-hdfs-nfs3 start

 

Verify validity of NFS related services

[root@phdmst04 ~]# rpcinfo -p phdmst04
   program vers proto   port  service
    100005    1   tcp   4242  mountd
    100000    2   udp    111  portmapper
    100005    3   tcp   4242  mountd
    100005    2   udp   4242  mountd
    100003    3   tcp   2049  nfs
    100000    2   tcp    111  portmapper
    100005    3   udp   4242  mountd
    100005    1   udp   4242  mountd
    100005    2   tcp   4242  mountd

 

[root@phdmst04 ~]# showmount -e phdmst04
Export list for phdmst04:
/ 192.168.129.55/255.255.255.0

#mount -t nfs -o vers=3,proto=tcp,nolock,noatime phdmst04:/ /data/hdfs_mnt

[root@phdmst04 ~]# df -h
Filesystem            Size  Used Avail Use% Mounted on
/dev/mapper/vg_cmri-lv_root
                      202G   11G  182G   6% /
tmpfs                  95G     0   95G   0% /dev/shm
/dev/sda1             485M   66M  394M  15% /boot
/dev/mapper/vg_cmri-lv_home
                      9.9G  2.1G  7.3G  23% /home
/dev/mapper/vg_data-lv_data
                      493G  243M  467G   1% /data
phdmst04:/            323T  3.3T  320T   2% /data/hdfs_mnt
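
To make the mount persistent across reboots, an /etc/fstab entry using the same options can be added (a sketch, not taken from the original notes):

phdmst04:/    /data/hdfs_mnt    nfs    vers=3,proto=tcp,nolock,noatime    0 0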

Troubleshooting

check nfs3 and portmap status

[root@blpphdmst04 init.d]# ./hadoop-hdfs-portmap status
portmap is stopped
[root@blpphdmst04 init.d]# ./hadoop-hdfs-portmap start
starting portmap, logging to /var/log/gphd/hadoop-hdfs/hadoop-hdfs-portmap-blpphdmst04.mydev.com.out
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.

[  OK  ]

[root@blpphdmst04 init.d]# ./hadoop-hdfs-nfs3 status
nfs3 is stopped

[root@blpphdmst04 init.d]# ./hadoop-hdfs-nfs3 start
starting nfs3, logging to /var/log/gphd/hadoop-hdfs/hadoop-hdfs-nfs3-blpphdmst04.mydev.com.out
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.

[  OK  ]
[root@blpphdmst04 init.d]# mount -t nfs -o vers=3,proto=tcp,nolock,noatime blpphdmst04:/ /data/hdfs_mnt
[root@blpphdmst04 init.d]# df -h
Filesystem            Size  Used Avail Use% Mounted on
/dev/mapper/vg_cmri-lv_root
202G   16G  176G   9% /
tmpfs                  95G     0   95G   0% /dev/shm
/dev/sda1             477M   89M  363M  20% /boot
/dev/mapper/vg_cmri-lv_home
9.8G  2.0G  7.3G  22% /home
/dev/mapper/vg_data-lv_data
493G  117M  467G   1% /data
blpphdmst04:/      269T  2.9T  266T   2% /data/hdfs_mnt