Sunday, July 29, 2018

Use Jupyter Notebook with Spark2 on Cloudera

Step 1: Find out the necessary environment variables

[donghua@cdh-vm ~]$ cat getPythonEnv.sh
import os
print "SPARK_HOME: %s"%(os.environ['SPARK_HOME'])
print "HADOOP_CONF_DIR: %s"%(os.environ['HADOOP_CONF_DIR'])
print "SPARK_CONF_DIR: %s"%(os.environ['SPARK_CONF_DIR'])
print "PYTHONPATH: %s"%(os.environ['PYTHONPATH'])
print "PYTHONSTARTUP: %s"%(os.environ['PYTHONSTARTUP'])
print "PYSPARK_SUBMIT_ARGS: %s"%(os.environ['PYSPARK_SUBMIT_ARGS'])

[donghua@cdh-vm ~]$ pyspark2
SPARK_HOME: /opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2
HADOOP_CONF_DIR: /opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/conf/yarn-conf
SPARK_CONF_DIR: /opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/conf
PYTHONPATH: /opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/python/lib/py4j-0.10.6-src.zip:/opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/python/:
PYTHONSTARTUP: /opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/python/pyspark/shell.py
PYSPARK_SUBMIT_ARGS: "--name" "PySparkShell" "pyspark-shell"
[donghua@cdh-vm ~]$

Step 2: Prepare the kernel file. Below is one working example:

mkdir /opt/anaconda2/share/jupyter/kernels/pyspark2/

[root@cdh-vm bin]# cat  /opt/anaconda2/share/jupyter/kernels/pyspark2/kernel.json
    {
      "argv": [
        "python2.7",
        "-m",
        "ipykernel_launcher",
        "-f",
        "{connection_file}"
      ],
      "display_name": "Python2.7 + Pyspark(Spark 2.3.0)",
      "language": "python",
      "env": {
        "PYSPARK_PYTHON": "/opt/anaconda2/bin/python2.7",
        "SPARK_HOME": "/opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2",
        "HADOOP_CONF_DIR": "/opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/conf/yarn-conf",
        "SPARK_CONF_DIR": "/opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/conf",
        "PYTHONPATH": "/opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/python/lib/py4j-0.10.6-src.zip:/opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/python/:",
        "PYTHONSTARTUP": "/opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/python/pyspark/shell.py",
        "PYSPARK_SUBMIT_ARGS": "--name 'Jupyter Notebook' --master yarn --deploy-mode client pyspark-shell"
      }
    }


Step 3: Start the notebook:

[donghua@cdh-vm ~]$ /opt/anaconda2/bin/jupyter-notebook --ip=192.168.31.238 --port 9999