Friday, August 16, 2019

How to limit NiFi resource usage

The approach: after nifi.sh launches the JVM, wait for the PID file and pin the process to a single CPU core with taskset. File to modify: /opt/nifi-1.9.2/bin/nifi.sh

    if [ "$1" = "start" ]; then
        ( eval "cd ${NIFI_HOME} && ${run_nifi_cmd}" & )> /dev/null 1>&-
        sleep 10
        taskset -pc 0 `cat /opt/nifi-1.9.2/run/nifi.pid`
    else


To verify:

[root@devops bin]# ./nifi.sh start
nifi.sh: JAVA_HOME not set; results may vary

Java home:
NiFi home: /opt/nifi-1.9.2

Bootstrap Config File: /opt/nifi-1.9.2/conf/bootstrap.conf

pid 4427's current affinity list: 0-2
pid 4427's new affinity list: 0

[root@devops bin]# taskset  -p `cat /opt/nifi-1.9.2/run/nifi.pid`
pid 4427's current affinity mask: 1
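
taskset accepts a CPU list, so more than one core can be allowed; a cgroup-based cap is an alternative that throttles CPU share instead of pinning. This is a sketch only: it requires the libcgroup tools, and the "nifi" group name is an assumption.

# Allow NiFi to use CPUs 0 and 1 instead of a single core
taskset -pc 0,1 `cat /opt/nifi-1.9.2/run/nifi.pid`

# Alternative sketch: cap CPU time via a cgroup (cgroup v1)
cgcreate -g cpu:/nifi
cgset -r cpu.cfs_quota_us=100000 nifi   # 100000us quota per 100000us period = at most 1 core
cgclassify -g cpu:/nifi `cat /opt/nifi-1.9.2/run/nifi.pid`

Note this only bounds CPU; the JVM heap is capped separately via the java.arg.2=-Xms... and java.arg.3=-Xmx... entries in conf/bootstrap.conf.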



Tuesday, August 6, 2019

Tips to identify YARN applications for Hive Tez sessions in the Resource Manager

How To:

Use "hive.query.tag" at the connection string or set using "set hive.query.tag=usertag"

beeline -n hive -u jdbc:hive2://hdp:10000/default?hive.query.tag=donghuatag

Output:

[root@hdp ~]# yarn application -list -appTags donghuatag
19/08/06 21:52:30 INFO client.RMProxy: Connecting to ResourceManager at hdp.dbaglobe.com/192.168.31.22:8050
Total number of applications (application-types: [], states: [SUBMITTED, ACCEPTED, RUNNING] and tags: [donghuatag]):1
                Application-Id      Application-Name      Application-Type        User       Queue               State         Final-State         Progress                        Tracking-URL
application_1565096253064_0004  HIVE-7c5ec7e6-6944-483d-bf2d-b7a2d18ad7c1                  TEZ        hive     default             RUNNING           UNDEFINED               0%                http://hdp:41592/ui/
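
The tag can also be set at runtime rather than in the JDBC URL, and once -list resolves the tag to an application id, the session's YARN application can be killed. A sketch, reusing the application id from the output above:

# Tag the session at runtime instead of via the connection string
beeline -n hive -u jdbc:hive2://hdp:10000/default \
        -e "set hive.query.tag=donghuatag; select current_database();"

# Resolve the tag to an application id, then kill it if needed
yarn application -list -appTags donghuatag
yarn application -kill application_1565096253064_0004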

Monday, June 3, 2019

Ansible playbook - prepare the target demo machine



prepare_ansible_target.yaml
---
# Run with: ansible-playbook prepare_ansible_target.yaml -k (use -k to prompt for the SSH password on the first run)
#  (make sure to add the IPs of machines you want to manage to /etc/ansible/hosts first)

- hosts: gw
  gather_facts: False
  remote_user: root
  # become: yes
  # become_user: root
  # become_method: sudo

  tasks:
    - name: Install python 2
      raw: test -e /usr/bin/python || (yum update -y && yum install -y python)

    - name: Fancy way of doing authorized_keys
      authorized_key: user=root
                      exclusive=no
                      key="{{ lookup('file', '~/.ssh/id_rsa.pub') }}"

    #- name: Create /root/.ssh
    #  file: path=/root/.ssh state=directory mode=0700

    #- name: Create /root/.ssh/authorized_keys from our local ssh pubkey
    #  lineinfile: dest=/root/.ssh/authorized_keys line="{{ lookup('file','~/.ssh/id_rsa.pub') }}"

    - name: Set timezone to Asia/Singapore
      raw: ( timedatectl |grep "Time zone" |grep -i Singapore) || timedatectl set-timezone Asia/Singapore
      
    - name: Add lines to avoid "LC_CTYPE" error in Mac OSX terminal
      lineinfile: dest=/etc/environment line='{{ item }}' state=present backup=yes
      with_items:
        - '# Add following 2 lines to fix LC_CTYPE error'
        - 'LANG=en_US.utf-8'
        - 'LC_ALL=en_US.utf-8'

host
[all]
Donghuas-MacBook-Air.local

[gw]
192.168.31.78 ansible_python_interpreter=/usr/bin/python ansible_user=root

Execute the playbook:
Donghuas-MacBook-Air:ansible donghua$ ansible-playbook prepare_ansible_target.yaml -i host 

PLAY [gw] ****************************************************************************************************************************************

TASK [Install python 2] **************************************************************************************************************************
changed: [192.168.31.78]

TASK [Fancy way of doing authorized_keys] ********************************************************************************************************
ok: [192.168.31.78]

TASK [Set timezone to Asia/Singapore] ************************************************************************************************************
changed: [192.168.31.78]

TASK [Add lines to avoid "LC_CTYPE" error in Mac OSX terminal] ***********************************************************************************
ok: [192.168.31.78] => (item=# Add following 2 lines to fix LC_CTYPE error)
ok: [192.168.31.78] => (item=LANG=en_US.utf-8)
ok: [192.168.31.78] => (item=LC_ALL=en_US.utf-8)

PLAY RECAP ***************************************************************************************************************************************
192.168.31.78              : ok=4    changed=2    unreachable=0    failed=0    skipped=0    rescued=0    ignored=0   
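
With the public key now in authorized_keys, later runs no longer need -k. A quick ad-hoc check (a sketch, using the same inventory file) confirms passwordless access and the timezone change:

ansible gw -i host -m ping
ansible gw -i host -a "timedatectl"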

Saturday, May 4, 2019

Convert tables in all PDF pages into CSV


(bigdata) Donghuas-MacBook-Air:Documents donghua$ pip install tabula-py
(bigdata) Donghuas-MacBook-Air:Documents donghua$ python
Python 2.7.15 |Anaconda, Inc.| (default, Dec 14 2018, 13:10:39) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import tabula
>>> tabula.convert_into("/Users/donghua/Documents/SASMO-Result-Grade-3.pdf", "/Users/donghua/Documents/SASMO-Result-Grade-3.csv", output_format="csv",pages="all")
May 04, 2019 12:36:19 AM org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB suggestKCMS
INFO: To get higher rendering speed on JDK8 or later,
May 04, 2019 12:36:19 AM org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB suggestKCMS
INFO:   use the option -Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider
May 04, 2019 12:36:19 AM org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB suggestKCMS
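
To inspect the parsed tables before writing the CSV, tabula-py can return them as pandas DataFrames. A sketch; the exact return type of read_pdf varies across tabula-py versions, so multiple_tables=True is passed explicitly:

import tabula

# Parse every page into a list of DataFrames for inspection
dfs = tabula.read_pdf("/Users/donghua/Documents/SASMO-Result-Grade-3.pdf",
                      pages="all", multiple_tables=True)
for i, df in enumerate(dfs):
    print(i, df.shape)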



Wednesday, April 17, 2019

Differences between Parquet and ORC formats

REPLACE COLUMNS that drops a column fails on ORC but succeeds on Parquet, as the two sessions below show.

ORC format
0: jdbc:hive2://hdp.dbaglobe.com:10000/demodb> create external table managed_t1 (id int, name string, remark string) stored as orc location '/data/managed_t1' tblproperties('transactional'='false');

0: jdbc:hive2://hdp.dbaglobe.com:10000/demodb> alter table managed_t1 replace columns (id int, name string);
INFO  : Compiling command(queryId=hive_20190417210521_b1634dd1-280f-41c2-96a1-d0e921c64041): alter table managed_t1 replace columns (id int, name string)
INFO  : Semantic Analysis Completed (retrial = false)
INFO  : Returning Hive schema: Schema(fieldSchemas:null, properties:null)
INFO  : Completed compiling command(queryId=hive_20190417210521_b1634dd1-280f-41c2-96a1-d0e921c64041); Time taken: 0.047 seconds
INFO  : Executing command(queryId=hive_20190417210521_b1634dd1-280f-41c2-96a1-d0e921c64041): alter table managed_t1 replace columns (id int, name string)
INFO  : Starting task [Stage-0:DDL] in serial mode
ERROR : FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. Replacing columns cannot drop columns for table demodb.managed_t1. SerDe may be incompatible
INFO  : Completed executing command(queryId=hive_20190417210521_b1634dd1-280f-41c2-96a1-d0e921c64041); Time taken: 0.021 seconds
Error: Error while processing statement: FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. Replacing columns cannot drop columns for table demodb.managed_t1. SerDe may be incompatible (state=42000,code=1)
Parquet format

0: jdbc:hive2://hdp.dbaglobe.com:10000/demodb> create external table managed_t1 (id int, name string, remark string) stored as parquet location '/data/managed_t1' tblproperties('transactional'='false');

0: jdbc:hive2://hdp.dbaglobe.com:10000/demodb> alter table managed_t1 replace columns (id int, name string);
INFO  : Compiling command(queryId=hive_20190417210231_90e8f8bd-bb03-4aa3-b138-e693ca59dc1b): alter table managed_t1 replace columns (id int, name string)
INFO  : Semantic Analysis Completed (retrial = false)
INFO  : Returning Hive schema: Schema(fieldSchemas:null, properties:null)
INFO  : Completed compiling command(queryId=hive_20190417210231_90e8f8bd-bb03-4aa3-b138-e693ca59dc1b); Time taken: 0.088 seconds
INFO  : Executing command(queryId=hive_20190417210231_90e8f8bd-bb03-4aa3-b138-e693ca59dc1b): alter table managed_t1 replace columns (id int, name string)
INFO  : Starting task [Stage-0:DDL] in serial mode
INFO  : Completed executing command(queryId=hive_20190417210231_90e8f8bd-bb03-4aa3-b138-e693ca59dc1b); Time taken: 0.148 seconds
INFO  : OK
No rows affected (0.278 seconds)
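
Takeaway: the ORC SerDe rejects REPLACE COLUMNS when it would drop a column, while the Parquet SerDe accepts it. For ORC, one workaround sketch is to recreate the external table definition without the column; dropping an external table keeps its files, so the same location can be reused (whether the existing files then read cleanly depends on ORC schema-evolution settings):

drop table managed_t1;  -- external table: files under /data/managed_t1 are retained
create external table managed_t1 (id int, name string)
  stored as orc location '/data/managed_t1'
  tblproperties('transactional'='false');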

Saturday, March 30, 2019

Script to analyze personal trip data exported from Grab



import pandas as pd
import numpy as np

# Skip the export's header (3 rows) and footer (3 rows) around the trip table;
# skipfooter requires the python engine
df = pd.read_csv('/Users/donghua/Documents/Grab-history-20180-201903.csv',
                 skiprows=3, skipfooter=3, engine='python')
# df.head(1)

# Parse timestamps such as "Sat 30 Mar 2019, 08:15:30 PM" and derive a month key
df['Date'] = pd.to_datetime(df['Date'], format='%a %d %b %Y, %I:%M:%S %p')
df['Month'] = df['Date'].dt.strftime('%Y-%m')
# df.head(1)

# Total and average fare per month
df[['Month','Fare']].groupby('Month').sum().sort_index()
df[['Month','Fare']].groupby('Month').mean().sort_index()

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')  # notebook-export form of %matplotlib inline
df[['Month','Fare']].groupby('Month').sum().plot.bar();
df[['Month','Fare']].groupby('Month').mean().plot.bar();
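
The two groupby passes can be combined into one with agg (a small refactor, not in the original notebook):

# Sum and mean fare per month in a single pass
df.groupby('Month')['Fare'].agg(['sum', 'mean']).sort_index()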