Donghua's Blog - DBAGlobe: 2018

Monday, December 31, 2018

Demo code to show case Solr ingestion

# Written for Solr 7 (shipped with CDH6) and Python3

import tika
import json
import urllib3
import traceback
import os


tika.initVM()
from tika import parser
url = 'http://node02.dbaglobe.com:8983/solr/cms/update/json/docs?commit=true'
filelist = ['D:\\Temp\\Building Positive Relationships young children.pdf',
            'D:\\Temp\\Building Positive Relationships spouse n in laws.pdf']

http = urllib3.PoolManager()

for file in filelist:
    try:
        parsed = parser.from_file(file)
        #Add content to "combined" dict object        combined={}
        combined['id']=os.path.basename(file) # use file name as Doc ID        combined.update(parsed["metadata"])
        combined['content']=parsed["content"]
        combined_json = json.loads(json.dumps(combined))

        print(combined_json)

        # to clean up, execute solr command *:*        # use immutable to avoid error "This ConfigSet is immutable.", use below to create the template before create the collection        # http://node02:8983/solr/admin/configs?action=CREATE&name=myConfigSet&baseConfigSet=schemalessTemplate&configSetProp.immutable=false&wt=xml        # to search: content:"Psychologist"
        response = http.request('POST',url,body=json.dumps(combined_json),headers={'Content-Type': 'application/json'})
        print (response.data)
    except:
        print(traceback.format_exc())

Sunday, December 16, 2018

Sentry and Hive permission explained

When using Sentry, the impersonation feature of HiveServer2 is disabled and each query runs in the cluster as the configured Hive principal. Thus, each HDFS location associated with a Hive table should be readable and writable by the Hive user or group.

If you are using the HDFS ACL synchronization feature, the required HDFS permissions (r-x for SELECT, -wx for INSERT, and rwx for ALL) on files are enforced automatically and maintained dynamically in response to changes in privilege grants on databases and tables. In our example, the alice user would be given r-x permission to files in tables in the sales database. Note that a grant on a URI object does not result in corresponding permissions on the location in HDFS.

Saturday, October 13, 2018

Tweet Read Example with PySpark streaming analytics

===================TweetsRead.py===================

import tweepy
from tweepy import Stream
from tweepy.auth import OAuthHandler
from tweepy.streaming import StreamListener
import socket
import json

consumer_key = ''consumer_secret = ''access_token = ''access_secret = ''

class TweetsListener(StreamListener):

    def __init__(self, csocket):
        self.client_socket = csocket

    def on_data(self, data):
        try:
            msg = json.loads(data)
            print(msg['text'].encode('utf-8'))
            self.client_socket.send(msg['text'].encode('utf-8'))
            return True        except BaseException as e:
            print("Error on data: %s" % str(e))
        return True
    def on_error(self, status):
        print(status)
        return True

def sendData(c_socket):
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)

    twitter_stream = Stream(auth, TweetsListener(c_socket))
    twitter_stream.filter(track=['china'])


if __name__ == "__main__":
    s = socket.socket()  # create socket object    host = "127.0.0.1"    port = 5555    s.bind((host, port))
    print("Listening on port: %s" % str(port))

    s.listen(5)
    c, addr = s.accept()
    print("Received request from: " + str(addr))

    sendData(c)

===================TwitterAnalytics.py===================

import findspark

findspark.init('/Users/donghua/spark-2.3.2-bin-hadoop2.7')

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc

# Not required if running in Pyspark integrated notebooksc = SparkContext()

ssc = StreamingContext(sc, 10)
sqlContext = SQLContext(sc)

socket_stream = ssc.socketTextStream("127.0.0.1",5555)


lines = socket_stream.window(20)

from collections import namedtuple
fields = ("tag","count")
Tweet = namedtuple('Tweet', fields)

# use () for multiple lines(lines.flatMap(lambda text: text.split(" "))
.filter(lambda word: word.startswith("#"))
.map(lambda word: (word.lower(), 1))
.reduceByKey(lambda a, b : a + b)
.map(lambda rec: Tweet(rec[0],rec[1]))
.foreachRDD(lambda rdd: rdd.toDF().sort(desc("count"))
.limit(10).registerTempTable("tweets")))


import time
from IPython import display
import matplotlib.pyplot as plt
import seaborn as sns
import pandas
# get_ipython().run_line_magic('matplotlib', 'inline')

ssc.start()

count = 0while count < 10:
    time.sleep(10)
    top_10_tweets = sqlContext.sql("select tag,count from tweets")
    top_10_df = top_10_tweets.toPandas()
    display.clear_output(wait = True)
    plt.figure(figsize= (10, 8))
    sns.barplot(x="count", y="tag",data=top_10_df)
    plt.show()
    count = count+1
ssc.stop()

Wednesday, October 10, 2018

Use Jupyter Notebook with Spark2 on Apache Spark on MacOS

Reference url: http://www.dbaglobe.com/2018/07/use-jupyter-notebook-with-spark2-on.html

Below are changes specific for apache spark, anaconda3 on MacOS

mkdir /Users/donghua/anaconda3/share/jupyter/kernels/pyspark2/

[root@cdh-vm bin]# cat /Users/donghua/anaconda3/share/jupyter/kernels/pyspark2/kernel.json
{
"argv": [
"python3.6",
"-m",
"ipykernel_launcher",
"-f",
"{connection_file}"
],
"display_name": "Python3.6 + Pyspark(Spark 2.3.2)",
"language": "python",
"env": {
"PYSPARK_PYTHON": "python",
"SPARK_HOME": "/Users/donghua/spark-2.3.2-bin-hadoop2.7",
"SPARK_CONF_DIR": "/Users/donghua/spark-2.3.2-bin-hadoop2.7/conf",
"PYTHONPATH": "/Users/donghua/spark-2.3.2-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip:/Users/donghua/spark-2.3.2-bin-hadoop2.7/python/:",
"PYTHONSTARTUP": "/Users/donghua/spark-2.3.2-bin-hadoop2.7/python/pyspark/shell.py",
"PYSPARK_SUBMIT_ARGS": "--master spark://Donghuas-MacBook-Air.local:7077 --name PySparkShell pyspark-shell"
}
}

/Users/donghua/anaconda3//bin/jupyter-notebook --ip=Donghuas-MacBook-Air.local --port 9999

#export PYSPARK_DRIVER_PYTHON=ipython
#export SPARK_HOME=/Users/donghua/spark-2.3.2-bin-hadoop2.7
#export PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH

Tuesday, October 9, 2018

Modify default slaves.sh to start apache-spark on MacOS

By default, Apache Spark sbin/start-all.sh will try to start worker using "ssh" to slave node, regardless we were testing using our laptop. Below modification is to start it using Bash instead of SSH.

Donghuas-MacBook-Air:sbin donghua$ diff slaves.sh slaves.sh.old

92,94c92,93

< cmd="${@// /\\ } 2>&1"

< echo $cmd

< bash -c "$cmd"

---

> ssh $SPARK_SSH_OPTS "$slave" $"${@// /\\ }" \

> 2>&1 | sed "s/^/$slave: /"

96,98c95,96

< cmd="${@// /\\ } 2>&1"

< echo $cmd

< bash -c "$cmd"

---

> ssh $SPARK_SSH_OPTS "$slave" $"${@// /\\ }" \

> 2>&1 | sed "s/^/$slave: /" &

Revised code could be found here:

https://github.com/luodonghua/bigdata/blob/master/slaves.sh

And modified version of spark-conf.sh to start multiple workers could be found here:

https://github.com/luodonghua/bigdata/blob/master/spark-config.sh

Sunday, September 30, 2018

Cloudera CDH "Host Clock Offset" explained

https://www.cloudera.com/documentation/enterprise/5-8-x/topics/cm_ht_host.html#concept_pnm_cmn_yk

This is a host health test that checks if the host's system clock appears to be out-of-sync with its NTP server(s).

The test uses the 'ntpdc -np' (if ntpd is running) or 'chronyc sources' (if chronyd is running) command to check that the host is synchronized to an NTP peer and that the absolute value of the host's clock offset from that peer is not too large.

If the command fails, NTP is not synchronized to a server, or the host's NTP daemon is not running or cannot be contacted, the test will return "Bad" health. The 'ntpdc -np' or 'chronyc sources' output contains a row for each of the host's NTP servers. The row starting with a '*' (if ntpdc) or '^*' (if chronyc) contains the peer to which the host is currently synchronized. No row starting with a '*' or '^*' indicates that the host is not currently synchronized.

Communication errors and too large an offset between the peer and the host time are examples of conditions that can lead to a host being unsynchronized. Make sure that UDP port 123 is open in any firewall that is in use. Check the system log for ntpd or chronyd messages related to configuration errors.

If running ntpd, use 'ntpdc -c iostat' to verify that packets are sent and recieved between the different peers. More information about the conditions of each peer can be found by running the command 'ntpq -c as'. The output of this command includes the association ID that can be used in combination with 'ntpq -c "rv "' to get more information about the status of each such peer. The command 'ntpq -c pe' can also be used to return a summary of all peers and the reason why they are not in use.

If running chronyd, use 'chronyc activity' to check how many NTP sources are online/offline. More information about the conditions of each peer can be found by running the command 'chronyc sourcestats'. To check chrony tracking, issue the command 'chronyc tracking'.

If NTP is not in use on the host, this check should be disabled for the host using the configuration options shown below. Cloudera recommends using NTP for time synchronization of Hadoop clusters. A failure of this health test can indicate a problem with the host's NTP service or configuration. This test can be configured using the Host Clock Offset Thresholds host configuration setting.

Saturday, September 8, 2018

Regular Expression URLs useful during NiFi development

https://www.debuggex.com/

https://regexr.com/

https://regex101.com/

http://qfsm.sourceforge.net/download.html

NiFi using Java Regular Expression Syntax
https://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html

Friday, August 31, 2018

Clean up hanging/stuck ambari background operations manually

login ambari database and find out tasking ID required to clean up:

select task_id,role,role_command from host_role_command where status='IN_PROGRESS';
select task_id,role,role_command from host_role_command where status='QUEUED';
select task_id,role,role_command from host_role_command where status='PENDING';

update identified task_id to aborted status:

update host_role_command set status='ABORTED' where task_id in (264,266);

Wednesday, August 22, 2018

How to enable support for TLS 1.2 in Windows 7 and Windows 8.1

Starting July 1, 2018, support is being removed for TLS 1.0 and 1.1 from Azure VPN Gateway. VPN Gateway will support only TLS 1.2. To maintain support, see the updates to enable support for TLS1.2.

How to enable support for TLS 1.2 in Windows 7 and Windows 8.1:

Open a command prompt with elevated priveleges by right-clicking on Command Prompt and selecting Run as administrator.

Run the following commands in the command prompt:

reg add HKLM\SYSTEM\CurrentControlSet\Services\RasMan\PPP\EAP\13 /v TlsVersion /t REG_DWORD /d 0xfc0
reg add "HKLM\SOFTWARE\Microsoft\Windows\CurrentVersion\Internet Settings\WinHttp" /v DefaultSecureProtocols /t REG_DWORD /d 0xaa0
if %PROCESSOR_ARCHITECTURE% EQU AMD64 reg add "HKLM\SOFTWARE\Wow6432Node\Microsoft\Windows\CurrentVersion\Internet Settings\WinHttp" /v DefaultSecureProtocols /t REG_DWORD /d 0xaa0

~~Install the following updates:~~

~~KB3140245~~
~~KB2977292~~
~~Reboot the computer.~~

Reference: https://docs.microsoft.com/en-us/azure/vpn-gateway/point-to-site-about

Tuesday, August 21, 2018

Install banana dashboard in Cloudera (using CDH6 beta as example)

Step 1: download banana from github
[root@cdh60b ~]# wget https://github.com/lucidworks/banana/archive/release.zip

Step 2: unzip the release file into "/opt/cloudera/parcels/CDH/lib/solr/server/solr-webapp/webapp/"
[root@cdh60b ~]# unzip release.zip

[root@cdh60b ~]# mv banana-release /opt/cloudera/parcels/CDH/lib/solr/server/solr-webapp/webapp/banana

[root@cdh60b webapp]# ls -l /opt/cloudera/parcels/CDH/lib/solr/server/solr-webapp/webapp/banana/
total 72
-rw-r--r-- 1 root root 665 Jun 4 2017 bower.json
-rw-r--r-- 1 root root 2669 Jun 4 2017 build.xml
-rw-r--r-- 1 root root 2464 Jun 4 2017 CONTRIBUTING.md
-rw-r--r-- 1 root root 262 Jun 4 2017 default.properties
-rw-r--r-- 1 root root 8478 Jun 4 2017 Gruntfile.js
-rw-r--r-- 1 root root 1531 Jun 4 2017 index.html
drwxr-xr-x 2 root root 31 Jun 4 2017 jetty-contexts
-rw-r--r-- 1 root root 610 Jun 4 2017 LICENSE.md
-rw-r--r-- 1 root root 2169 Jun 4 2017 mvn.template
-rw-r--r-- 1 root root 6990 Jun 4 2017 NOTICE.txt
-rw-r--r-- 1 root root 2176 Jun 4 2017 package.json
-rw-r--r-- 1 root root 3369 Jun 4 2017 pom.xml
-rw-r--r-- 1 root root 131 Jun 4 2017 QUICKSTART
-rw-r--r-- 1 root root 9969 Jun 4 2017 README.md
drwxr-xr-x 7 root root 134 Jun 4 2017 resources
drwxr-xr-x 8 root root 107 Jun 4 2017 src
drwxr-xr-x 4 root root 116 Jun 4 2017 test

Step 3, restart SOLR service using Cloudera manager.

http://cdh60b.dbaglobe.com:8983/solr/banana/src/index.html#/dashboard

If you want to save and load dashboards from Solr, then you need to create a collection called banana-int first. For Solr 6, here are the steps:

[donghua@cdh60b ~]$ cd /opt/cloudera/parcels/CDH/lib/solr/bin
[donghua@cdh60b bin]$ ls
init.d oom_solr.sh sentryMigrationTool solr.cmd solr.in.cmd zksynctool.sh
install_solr_service.sh post snapshotscli.sh solrctl.sh solr.in.sh
log4j.properties sentrycli.sh solr solrd zkcli.sh
[donghua@cdh60b bin]$ ./solr create -c banana-int
WARNING: Using _default configset. Data driven schema functionality is enabled by default, which is
NOT RECOMMENDED for production use.

To turn it off:
curl http://localhost:8983/solr/banana-int/config -d '{"set-user-property": {"update.autoCreateFields":"false"}}'

Connecting to ZooKeeper at cdh60b.dbaglobe.com:2181/solr ...
INFO - 2018-08-21 09:48:58.220; org.apache.solr.client.solrj.impl.ZkClientClusterStateProvider; Cluster at cdh60b.dbaglobe.com:2181/solr ready
Uploading /opt/cloudera/parcels/CDH/lib/solr/server/solr/configsets/_default/conf for config banana-int to ZooKeeper at cdh60b.dbaglobe.com:2181/solr

Creating new collection 'banana-int' using command:
http://localhost:8983/solr/admin/collections?action=CREATE&name=banana-int&numShards=1&replicationFactor=1&maxShardsPerNode=-1&collection.configName=banana-int

{
"responseHeader":{
"status":0,
"QTime":3084},
"success":{"cdh60b:8983_solr":{
"responseHeader":{
"status":0,
"QTime":1651},
"core":"banana-int_shard1_replica_n1"}}}

Sunday, August 19, 2018

Two methods to create keytab for secure cluster

Method 1: (alternative use kadmin, but not possible with -norandkey option)

[root@cdh60 flume-ng]# kadmin.local
Authenticating as principal root/admin@DBAGLOBE.COM with password.
kadmin.local: ktadd -k dsuser.keytab -norandkey dsuser@DBAGLOBE.COM
Entry for principal dsuser@DBAGLOBE.COM with kvno 3, encryption type aes256-cts-hmac-sha1-96 added to keytab WRFILE:dsuser.keytab.
Entry for principal dsuser@DBAGLOBE.COM with kvno 3, encryption type aes128-cts-hmac-sha1-96 added to keytab WRFILE:dsuser.keytab.
Entry for principal dsuser@DBAGLOBE.COM with kvno 3, encryption type des3-cbc-sha1 added to keytab WRFILE:dsuser.keytab.
Entry for principal dsuser@DBAGLOBE.COM with kvno 3, encryption type arcfour-hmac added to keytab WRFILE:dsuser.keytab.
Entry for principal dsuser@DBAGLOBE.COM with kvno 3, encryption type camellia256-cts-cmac added to keytab WRFILE:dsuser.keytab.
Entry for principal dsuser@DBAGLOBE.COM with kvno 3, encryption type camellia128-cts-cmac added to keytab WRFILE:dsuser.keytab.
Entry for principal dsuser@DBAGLOBE.COM with kvno 3, encryption type des-hmac-sha1 added to keytab WRFILE:dsuser.keytab.
Entry for principal dsuser@DBAGLOBE.COM with kvno 3, encryption type des-cbc-md5 added to keytab WRFILE:dsuser.keytab.
kadmin.local: quit

Method 2: works for both MITKDC and AD

[dsuser@cdh60 ~]$ ktutil
ktutil: addent -password -p dsuser@DBAGLOBE.COM -k 3 -e aes256-cts-hmac-sha1-96
Password for dsuser@DBAGLOBE.COM:
ktutil: wkt /home/dsuser/dsuser.keytab
ktutil: q

Configure HDFS NFS Gateway

Configure HDFS NFS Gateway

[root@cdh60 ~]# showmount -e cdh60
Export list for cdh60:
/ *

[root@cdh60 ~]# rpcinfo cdh60
program version netid address service owner
100000 4 tcp6 ::.0.111 portmapper superuser
100000 3 tcp6 ::.0.111 portmapper superuser
100000 4 udp6 ::.0.111 portmapper superuser
100000 3 udp6 ::.0.111 portmapper superuser
100000 4 tcp 0.0.0.0.0.111 portmapper superuser
100000 3 tcp 0.0.0.0.0.111 portmapper superuser
100000 2 tcp 0.0.0.0.0.111 portmapper superuser
100000 4 udp 0.0.0.0.0.111 portmapper superuser
100000 3 udp 0.0.0.0.0.111 portmapper superuser
100000 2 udp 0.0.0.0.0.111 portmapper superuser
100000 4 local /var/run/rpcbind.sock portmapper superuser
100000 3 local /var/run/rpcbind.sock portmapper superuser
100005 1 udp 0.0.0.0.16.146 mountd superuser
100005 2 udp 0.0.0.0.16.146 mountd superuser
100005 3 udp 0.0.0.0.16.146 mountd superuser
100005 1 tcp 0.0.0.0.16.146 mountd superuser
100005 2 tcp 0.0.0.0.16.146 mountd superuser
100005 3 tcp 0.0.0.0.16.146 mountd superuser
100003 3 tcp 0.0.0.0.8.1 nfs superuser

[root@cdh60 ~]# mkdir /hdfs_nfs_mount

root@cdh60 ~]# mount -t nfs -o vers=3,proto=tcp,nolock cdh60:/ /hdfs_nfs_mount
[root@cdh60 ~]# df -h /hdfs_nfs_mount
Filesystem Size Used Avail Use% Mounted on
cdh60:/ 69G 6.5G 63G 10% /hdfs_nfs_mount

Linux Permission Applies:

[root@cdh60 dsuser]# ls -ld /hdfs_nfs_mount/data/incoming/
drwxr-xr-x 3 dsuser 2584148964 96 Aug 19 12:17 /hdfs_nfs_mount/data/incoming/

[root@cdh60 dsuser]# cp employees.csv /hdfs_nfs_mount/data/incoming/
cp: cannot create regular file ‘/hdfs_nfs_mount/data/incoming/employees.csv’: Permission denied

Login as dsuser (owner of incoming folder)

Donghuas-MacBook-Air:pandas donghua$ ssh dsuser@cdh60
dsuser@cdh60's password:
Last login: Sun Aug 19 09:24:50 2018 from 192.168.1.1

[dsuser@cdh60 ~]$ klist
Ticket cache: FILE:/tmp/krb5cc_1001
Default principal: dsuser@DBAGLOBE.COM

Valid starting Expires Service principal
08/19/2018 09:24:54 08/20/2018 09:24:54 krbtgt/DBAGLOBE.COM@DBAGLOBE.COM
renew until 08/26/2018 09:24:54

[dsuser@cdh60 ~]$ cp employees.csv /hdfs_nfs_mount/data/incoming/

[dsuser@cdh60 ~]$ ls -l /hdfs_nfs_mount/data/incoming/employees.csv
-rw-r--r-- 1 dsuser 2584148964 59175 Aug 19 12:17 /hdfs_nfs_mount/data/incoming/employees.csv

[dsuser@cdh60 ~]$ hdfs dfs -ls /data/incoming/
Found 1 items
-rw-r--r-- 1 dsuser supergroup 59175 2018-08-19 12:17 /data/incoming/employees.csv

[dsuser@cdh60 ~]$ hdfs dfs -cat /data/incoming/employees.csv|head -n 3
First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
Douglas,Male,8/6/1993,12:42 PM,97308,6.945,true,Marketing
Thomas,Male,3/31/1996,6:53 AM,61933,4.17,true,

[dsuser@cdh60 ~]$ cat /hdfs_nfs_mount/data/incoming/employees.csv |head -n3
First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
Douglas,Male,8/6/1993,12:42 PM,97308,6.945,true,Marketing
Thomas,Male,3/31/1996,6:53 AM,61933,4.17,true,

Login as root (act as normal nfs client user)

[root@cdh60 dsuser]# hdfs dfs -cat /data/incoming/employees.csv|head -n 3
18/08/19 12:26:56 WARN ipc.Client: Exception encountered while connecting to the server : org.apache.hadoop.security.AccessControlException: Client cannot authenticate via:[TOKEN, KERBEROS]
cat: Failed on local exception: java.io.IOException: org.apache.hadoop.security.AccessControlException: Client cannot authenticate via:[TOKEN, KERBEROS]; Host Details : local host is: "cdh60.dbaglobe.com/192.168.56.110"; destination host is: "cdh60.dbaglobe.com":8020;

[root@cdh60 dsuser]# cat /hdfs_nfs_mount/data/incoming/employees.csv |head -n3
First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
Douglas,Male,8/6/1993,12:42 PM,97308,6.945,true,Marketing
Thomas,Male,3/31/1996,6:53 AM,61933,4.17,true,

Sunday, August 5, 2018

How to enable PAM Authentication in Zeppelin

Step 1: comment out preconfigured user

/usr/hdp/current/zeppelin-server/conf/shiro.ini

#[users]
# List of users with their password allowed to access Zeppelin.
# To use a different strategy (LDAP / Database / ...) check the shiro doc at http://shiro.apache.org/configuration.html#Configuration-INISections
#admin = $shiro1$SHA-256$500000$p6Be9+t2hdUXJQj2D0b1fg==$bea5JIMqcVF3J6eNZGWQ/3eeDByn5iEZDuGsEip06+M=, admin
#user1 = $shiro1$SHA-256$500000$G2ymy/qmuZnGY6or4v2KfA==$v9fabqWgCNCgechtOUqAQenGDs0OSLP28q2wolPT4wU=, role1, role2
#user2 = $shiro1$SHA-256$500000$aHBgiuwSgAcP3Xt5mEzeFw==$KosBnN2BNKA9/KHBL0hnU/woJFl+xzJFj12NQ0fnjCU=, role3
#user3 = $shiro1$SHA-256$500000$nf0GzH10GbYVoxa7DOlOSw==$ov/IA5W8mRWPwvAoBjNYxg3udJK0EmrVMvFCwcr9eAs=, role2

Step 2: enable PAM

/usr/hdp/current/zeppelin-server/conf/shiro.ini

### A sample PAM configuration
pamRealm=org.apache.zeppelin.realm.PamRealm
pamRealm.service=sshd

Step 3: Grant access to /etc/shadow

[root@hdp30 bin]# setfacl -m user:zeppelin:r /etc/shadow
[root@hdp30 bin]# getfacl /etc/shadow
getfacl: Removing leading '/' from absolute path names
# file: etc/shadow
# owner: root
# group: root
user::---
user:zeppelin:r--
group::---
mask::r--
other::---

Step 4: Restart Zeppelin and login using os user donghua

==> /var/log/zeppelin/zeppelin-zeppelin-hdp30.log <==
WARN [2018-08-05 20:04:43,756] ({qtp110992469-18} LoginRestApi.java[postLogin]:206) - {"status":"OK","message":"","body":{"principal":"donghua","ticket":"fa4817f4-c68a-4c0e-b61a-39295daa3062","roles":"[]"}}

Reference: https://community.hortonworks.com/content/supportkb/167636/how-to-enable-pam-authentication-in-zeppelin.html

Saturday, August 4, 2018

Set timezone in Redhat Linux 7

[root@hdp30 ~]# timedatectl list-timezones|grep -i singapore
Asia/Singapore

[root@hdp30 ~]# timedatectl
Local time: Sat 2018-08-04 00:56:47 UTC
Universal time: Sat 2018-08-04 00:56:47 UTC
RTC time: Sat 2018-08-04 00:56:47
Time zone: Etc/UTC (UTC, +0000)
NTP enabled: yes
NTP synchronized: yes
RTC in local TZ: no
DST active: n/a

[root@hdp30 ~]# timedatectl set-timezone Asia/Singapore

[root@hdp30 ~]# timedatectl
Local time: Sat 2018-08-04 08:58:21 +08
Universal time: Sat 2018-08-04 00:58:21 UTC
RTC time: Sat 2018-08-04 00:58:22
Time zone: Asia/Singapore (+08, +0800)
NTP enabled: yes
NTP synchronized: yes
RTC in local TZ: no
DST active: n/a

Sunday, July 29, 2018

Use Jupyter Notebook with Spark2 on Cloudera

Step 1, find out necessary envronment variables

[donghua@cdh-vm ~]$ cat getPythonEnv.sh
import os
print "SPARK_HOME: %s"%(os.environ['SPARK_HOME'])
print "HADOOP_CONF_DIR: %s"%(os.environ['HADOOP_CONF_DIR'])
print "SPARK_CONF_DIR: %s"%(os.environ['SPARK_CONF_DIR'])
print "PYTHONPATH: %s"%(os.environ['PYTHONPATH'])
print "PYTHONSTARTUP: %s"%(os.environ['PYTHONSTARTUP'])
print "PYSPARK_SUBMIT_ARGS: %s"%(os.environ['PYSPARK_SUBMIT_ARGS'])

[donghua@cdh-vm ~]$ pyspark2
SPARK_HOME: /opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2
HADOOP_CONF_DIR: /opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/conf/yarn-conf
SPARK_CONF_DIR: /opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/conf
PYTHONPATH: /opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/python/lib/py4j-0.10.6-src.zip:/opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/python/:
PYTHONSTARTUP: /opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/python/pyspark/shell.py
PYSPARK_SUBMIT_ARGS: "--name" "PySparkShell" "pyspark-shell"
[donghua@cdh-vm ~]$

Step 2: Prepare the kernel file, below is one workable example

mkdir /opt/anaconda2/share/jupyter/kernels/pyspark2/

[root@cdh-vm bin]# cat /opt/anaconda2/share/jupyter/kernels/pyspark2/kernel.json
{
"argv": [
"python2.7",
"-m",
"ipykernel_launcher",
"-f",
"{connection_file}"
],
"display_name": "Python2.7 + Pyspark(Spark 2.3.0)",
"language": "python",
"env": {
"PYSPARK_PYTHON": "/opt/anaconda2/bin/python2.7",
"SPARK_HOME": "/opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2",
"HADOOP_CONF_DIR": "/opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/conf/yarn-conf",
"SPARK_CONF_DIR": "/opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/conf",
"PYTHONPATH": "/opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/python/lib/py4j-0.10.6-src.zip:/opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/python/:",
"PYTHONSTARTUP": "/opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/python/pyspark/shell.py",
"PYSPARK_SUBMIT_ARGS": "--name 'Jupyter Notebook' --master yarn --deploy-mode client pyspark-shell"
}
}

Start the notebook:

[donghua@cdh-vm ~]$ /opt/anaconda2/bin/jupyter-notebook --ip=192.168.31.238 --port 9999

Sunday, July 8, 2018

Using MySQL 8 with Hue in Cloudera CDH 5.15

Symptoms: Using MySQL 8 with Hue in Cloudera CDH 5.15

Error during adding Hue:

Unable to connect to database on host 'cdh-vm.dbaglobe.com' from host 'cdh-vm.dbaglobe.com' using the credential provided.

Error in cloudera-scm-server.log
+ exec /opt/cloudera/parcels/CDH-5.15.0-1.cdh5.15.0.p0.21/lib/hue/build/env/bin/hue is_db_alive
[08/Jul/2018 19:30:15 +0000] settings DEBUG DESKTOP_DB_TEST_NAME SET: /opt/cloudera/parcels/CDH-5.15.0-1.cdh5.15.0.p0.21/lib/hue/desktop/desktop-test.db
[08/Jul/2018 19:30:15 +0000] settings DEBUG DESKTOP_DB_TEST_USER SET: hue_test
[08/Jul/2018 04:30:23 +0000] __init__ INFO Couldn't import snappy. Support for snappy compression disabled.
Error accessing DB: (2059, "Authentication plugin 'caching_sha2_password' cannot be loaded: /usr/lib64/mysql/plugin/caching_sha2_password.so: cannot open shared object file: No such file or directory")

How to fix:

alter user 'hue'@'%' IDENTIFIED WITH mysql_native_password BY 'my_complex_password';

Sunday, June 3, 2018

Two methods to modify HDFS custom metadata

Two methods to modify HDFS custom metadata with Cloudera Navigator

- Metadata file
not recommended for production use as lead to small file problems
update provided through metadata files are queued before merged

- Metadata API
use either metadata file or API, not both
API overwrites metadata, and take effects immediately

[donghua@cdh-vm data]$ hdfs dfs -ls /data/donghua/*drink*
-rw-r--r-- 1 donghua hive 145 2018-06-03 11:27 /data/donghua/.drinks.csv.navigator
-rw-r--r-- 1 donghua hive 5918 2018-06-03 00:07 /data/donghua/drinks.csv

[donghua@cdh-vm data]$ hdfs dfs -cat /data/donghua/.drinks.csv.navigator
{
"name":"drinks dataset"
"description": "metadata example using .drinks.csv.navigator"
"properties":{
"Dept":"myDept"
},
"tags":["external"]
}

curl -u admin:admin -X GET 'http://cdh01:7187/api/v13/entities/?query=originalName:imdb_1000.csv&limit=100&offset=0'
[donghua@cdh-vm data]$ curl -u admin:admin -X GET 'http://cdh-vm:7187/api/v13/entities/?query=originalName%3D%22imdb_1000.csv%22&limit=100&offset=0'
[ {
"originalName" : "imdb_1000.csv",
"originalDescription" : null,
"sourceId" : "5",
"firstClassParentId" : null,
"parentPath" : "/data/donghua",
"deleteTime" : 0,
"extractorRunId" : "5##20",
"customProperties" : null,
"name" : null,
"description" : null,
"tags" : null,
"properties" : {
"__cloudera_internal__hueLink" : "http://cdh-vm:8889/filebrowser/#/data/donghua/imdb_1000.csv"
},
"technicalProperties" : null,
"fileSystemPath" : "/data/donghua/imdb_1000.csv",
"type" : "FILE",
"size" : 91499,
"created" : "2018-06-03T00:07:55.434Z",
"lastModified" : "2018-06-03T00:07:55.434Z",
"lastAccessed" : "2018-06-03T00:07:54.880Z",
"permissions" : "rw-r--r--",
"owner" : "donghua",
"group" : "hive",
"blockSize" : 134217728,
"mimeType" : "application/octet-stream",
"ezkeyName" : null,
"replication" : 1,
"metaClassName" : "fselement",
"deleted" : false,
"packageName" : "nav",
"userEntity" : false,
"sourceType" : "HDFS",
"identity" : "20388",
"internalType" : "fselement"
}, {
"originalName" : "imdb_1000.csv",
"originalDescription" : null,
"sourceId" : "5",
"firstClassParentId" : null,
"parentPath" : "/user/hive/warehouse/testdb.db/imdb_1000",
"deleteTime" : 0,
"extractorRunId" : "5##22",
"customProperties" : null,
"name" : null,
"description" : null,
"tags" : null,
"properties" : {
"__cloudera_internal__hueLink" : "http://cdh-vm:8889/filebrowser/#/user/hive/warehouse/testdb.db/imdb_1000/imdb_1000.csv"
},
"technicalProperties" : null,
"fileSystemPath" : "/user/hive/warehouse/testdb.db/imdb_1000/imdb_1000.csv",
"type" : "FILE",
"size" : 91499,
"created" : "2018-06-03T01:06:12.920Z",
"lastModified" : "2018-06-03T01:06:12.920Z",
"lastAccessed" : "2018-06-03T01:06:12.920Z",
"permissions" : "rw-r--r--",
"owner" : "hive",
"group" : "hive",
"blockSize" : 134217728,
"mimeType" : "application/octet-stream",
"ezkeyName" : null,
"replication" : 1,
"metaClassName" : "fselement",
"deleted" : false,
"packageName" : "nav",
"userEntity" : false,
"sourceType" : "HDFS",
"identity" : "22303",
"internalType" : "fselement"
}, {
"originalName" : "imdb_1000.csv._COPYING_",
"originalDescription" : null,
"sourceId" : "5",
"firstClassParentId" : null,
"parentPath" : "/data/donghua",
"deleteTime" : 1527984475434,
"extractorRunId" : "5##20",
"customProperties" : null,
"name" : null,
"description" : null,
"tags" : null,
"properties" : null,
"technicalProperties" : null,
"fileSystemPath" : "/data/donghua/imdb_1000.csv._COPYING_",
"type" : "FILE",
"size" : 91499,
"created" : "2018-06-03T00:07:54.880Z",
"lastModified" : "2018-06-03T00:07:54.880Z",
"lastAccessed" : "2018-06-03T00:07:54.880Z",
"permissions" : "rw-r--r--",
"owner" : "donghua",
"group" : "hive",
"blockSize" : 134217728,
"mimeType" : "application/octet-stream",
"ezkeyName" : null,
"replication" : 1,
"metaClassName" : "fselement",
"deleted" : true,
"packageName" : "nav",
"userEntity" : false,
"sourceType" : "HDFS",
"identity" : "20386",
"internalType" : "fselement"
} ]

curl -u admin:admin -X POST 'http://cdh-vm:7187/api/v13/entities/?query=originalName%3D%22imdb_1000.csv%22&limit=100&offset=0' \
-H "Content-Type:application/json" -d \
'{
"sourceId":"5",
"originalName" : "imdb_1000.csv",
"parentPath" : "/data/donghua",
"name":"imdb dataset",
"description": "metadata example using API",
"properties":{
"Dept":"myDept"
},
"tags":["external"]
}'

[donghua@cdh-vm data]$ curl -u admin:admin -X POST 'http://cdh-vm:7187/api/v13/entities/?query=originalName%3D%22imdb_1000.csv%22&limit=100&offset=0' \
> -H "Content-Type:application/json" -d \
> '{
> "sourceId":"5",
> "originalName" : "imdb_1000.csv",
> "parentPath" : "/data/donghua",
> "name":"imdb dataset",
> "description": "metadata example using API",
> "properties":{
> "Dept":"myDept"
> },
> "tags":["external"]
> }'
{
"originalName" : "imdb_1000.csv",
"originalDescription" : null,
"sourceId" : "5",
"firstClassParentId" : null,
"parentPath" : "/data/donghua",
"deleteTime" : 0,
"extractorRunId" : "5##20",
"customProperties" : null,
"name" : "imdb dataset",
"description" : "metadata example using API",
"tags" : [ "external" ],
"properties" : {
"Dept" : "myDept",
"__cloudera_internal__hueLink" : "http://cdh-vm:8889/filebrowser/#/data/donghua/imdb_1000.csv"
},
"technicalProperties" : null,
"fileSystemPath" : "/data/donghua/imdb_1000.csv",
"type" : "FILE",
"size" : 91499,
"created" : "2018-06-03T00:07:55.434Z",
"lastModified" : "2018-06-03T00:07:55.434Z",
"lastAccessed" : "2018-06-03T00:07:54.880Z",
"permissions" : "rw-r--r--",
"owner" : "donghua",
"group" : "hive",
"blockSize" : 134217728,
"mimeType" : "application/octet-stream",
"ezkeyName" : null,
"replication" : 1,
"metaClassName" : "fselement",
"deleted" : false,
"packageName" : "nav",
"userEntity" : false,
"sourceType" : "HDFS",
"identity" : "20388",
"internalType" : "fselement"
}

Donghua's Blog - DBAGlobe