Access data in relational databases

If your notebook needs to connect directly to a relational database with your own custom SQL queries, then you can use APIs from third-party modules:

Scala:: org.apache.spark
Python:: jaydbeapi
R:: rJDBC for Jupyter and SparkR for Zeppelin

Recommendation: To automate connections to your databases, add them as data sources to your project from the Watson Studio Local client.

To use the jaydebeapi APIs to create JDBC connections to databases, import the following libraries in your notebook:

import jaydebeapi

When the modules are imported, you can form the JDBC string and call the connect function along with a className to create the connection. The JDBC string is specific to each database and needs the driver className included when the connection function is called.

Python example for Db2 and Db2 Warehouse on Cloud (previously known as dashDB)

import jaydebeapi, sys

#Enter the values for you database connection
dsn_database = "BLUDB"            # e.g. "BLUDB" Name of the database
dsn_hostname = "dashdb.services.dal.bluemix.net" # e.g.: "bluemix05.bluforcloud.com"
dsn_port = "50000"                # e.g. "50000" Database port number
dsn_uid = "dashxxxx"        # e.g. "dash104434" User id
dsn_pwd = "xxxxxxx"       # e.g. "7dBZ3jWt9xN6$o0JiX!m" User password for the database

connection_string='jdbc:db2://'+dsn_hostname+':'+dsn_port+'/'+dsn_database
if (sys.version_info >= (3,0)):
    conn = jaydebeapi.connect("com.ibm.db2.jcc.DB2Driver", connection_string, [dsn_uid, dsn_pwd])
else:
    conn = jaydebeapi.connect("com.ibm.db2.jcc.DB2Driver", [connection_string, dsn_uid, dsn_pwd])

curs = conn.cursor()
curs.execute("select * from db2inst1.sales")
curs.fetchall()

Python example for an SSL datasource (Db2/BIGSQL)

import jaydebeapi, sys

#Enter the values for you database connection
dsn_database = "BIGSQL"            # e.g. "BLUDB"
dsn_hostname = "c04n1.ibm.com" # e.g.: "bluemix05.bluforcloud.com"
dsn_port = "52000"                # e.g. "50000" 
dsn_uid = "raguk_ca_ibm_com"        # e.g. "dash104434"
dsn_pwd = ""       # e.g. "7dBZ3jWt9xN6$o0JiX!m"

connection_string='jdbc:db2://'+dsn_hostname+':'+dsn_port+'/'+dsn_database:sslConnection=true;sslTrustStoreLocation=/user-home/_global_/security/customer-truststores/cacerts;'

if (sys.version_info >= (3,0)):
    conn = jaydebeapi.connect("com.ibm.db2.jcc.DB2Driver", connection_string, [dsn_uid, dsn_pwd])
else:
    conn = jaydebeapi.connect("com.ibm.db2.jcc.DB2Driver", [connection_string, dsn_uid, dsn_pwd])

curs = conn.cursor()
curs.execute("select * from db2inst1.sales")
curs.fetchall()

Python example for Netezza

import jaydebeapi, sys

#Enter the values for you database connection

dsn_database2 = "SYSTEM"            # e.g. "BLUDB"
dsn_hostname2 = "9.87.654.321" # e.g.: "hostname.site.com"
dsn_port2 = "5480"                # e.g. "50000"
dsn_uid2 = "admin"        # e.g. "dash104434"
dsn_pwd2 = "XXXXX"       # e.g. "7dBZ3jWt9xN6$o0JiX!m"

connection_string2='jdbc:netezza://'+dsn_hostname2+':'+dsn_port2+'/'+dsn_database2

if (sys.version_info >= (3,0)):
    conn = jaydebeapi.connect("org.netezza.Driver", connection_string2, [dsn_uid2, dsn_pwd2])
else:
    conn = jaydebeapi.connect("org.netezza.Driver", [connection_string2, dsn_uid2, dsn_pwd2])

curs = conn.cursor()
curs.execute("select * from FOOD")
curs.fetchall()
curs.close()

Python example for Informix

import jaydebeapi, sys

#Enter the values for you database connection

dsn_database2 = "stores_demo"            # e.g. "BLUDB"
dsn_hostname2 = "9.87.654.321" # e.g.: "hostname.site.com"
dsn_port2 = "9088"                # e.g. "50000"
dsn_uid2 = "informix"        # e.g. "dash104434"
dsn_pwd2 = "XXXXX"       # e.g. "7dBZ3jWt9xN6$o0JiX!m"
ifxserver = "dev"

connection_string2='jdbc:informix-sqli://'+dsn_hostname2+':'+dsn_port2+'/'+dsn_database2+':INFORMIXSERVER='+ifxserver+';user='+dsn_uid2+';password='+dsn_pwd2+";"

if (sys.version_info >= (3,0)):
    conn = jaydebeapi.connect("com.informix.jdbc.IfxDriver", connection_string2, [dsn_uid2, dsn_pwd2])
else:
    conn = jaydebeapi.connect("com.informix.jdbc.IfxDriver", [connection_string2, dsn_uid2, dsn_pwd2])

curs = conn.cursor()
curs.execute("select * from customer")
curs.fetchall()
curs.close()

Python example for Oracle

import dsx_core_utils, os, io
import pandas as pd
import cx_Oracle

#Read datasource information
dataSet = dsx_core_utils.get_remote_data_set_info('oracle-rds-tabbig2')
dataSource = dsx_core_utils.get_data_source_info(dataSet['datasource'])
df = None

# Example of URL defined for Datasource: 
# jdbc:oracle:thin:<user>/<password>@<ip>:<port>:<sid>
# 1. Extracting <ip>:<port>:<sid> from URL string
dbServer = dataSource['URL'].split('/')[1].split('@')[1]

# 2. Extracting ip, port, sid/service-name from dbServer
ip = dbServer.split(':')[0]
port = dbServer.split(':')[1]
sid = dbServer.split(':')[2]

# makedsn - Returns a string suitable for use as the dsn parameter for connect()
dsn = cx_Oracle.makedsn(ip, port, sid)

connection = cx_Oracle.connect(dataSource['user'], dataSource['password'], dsn)
cursor = connection.cursor()
cursor.execute('select * from ' + (dataSet['schema'] + '.' if (len(dataSet['schema'].strip()) != 0) else '') 
+ dataSet['table'] + ' FETCH NEXT 1000000 ROWS ONLY')
queryData = cursor.fetchall()

#Read column names to add to Pandas dataframe
colNames = []
for i in range(0, len(cursor.description)):
    colNames.append(cursor.description[i][0])
    
cursor.close()
connection.close()
df = pd.DataFrame(data=queryData, columns=colNames)
df.head(5)

Other databases

If you need to connect to a database that Watson Studio Local does not already provide a driver for, you must first [import the JDBC driver for it. Then, your notebook can connect directly to them.

In your notebook, the CLASSPATH must include the imported database driver as follows:

import os
os.environ["CLASSPATH"] = os.environ["CLASSPATH"]+":/usr/local/spark/jars/<driverFileName>"
os.environ["CLASSPATH"]

You might need to restart the kernel of the notebook or the interpreter in Zeppelin.

Python example for PostgreSQL

The following example hardcodes the database details in the notebook to access data from your database. For R, the path of the JAR file is specified. The imported JAR files are located in /usr/local/spark/jars.

#Enter the values for you database connection
dsn_database = "postgres"            # e.g. "BLUDB"
dsn_hostname = "9.87.654.321" # e.g.: "bluemix05.bluforcloud.com"
dsn_port = "5432"                # e.g. "50000"
dsn_uid = "postgres"        # e.g. "dash104434"
dsn_pwd = "XXXXX"       # e.g. "7dBZ3jWt9xN6$o0JiX!m"

connection_string='jdbc:postgresql://'+dsn_hostname+':'+dsn_port+'/'+dsn_database

if (sys.version_info >= (3,0)):
    conn = jaydebeapi.connect("org.postgresql.Driver", connection_string, [dsn_uid, dsn_pwd])
else:
    conn = jaydebeapi.connect("org.postgresql.Driver", [connection_string, dsn_uid, dsn_pwd])

curs = conn.cursor()
curs.execute("select * from myschema.cars")
curs.fetchall()
curs.close()

R example for PostgreSQL

The following R example creates a JDBC connection to PostgreSQL.

driverClassName <- "org.postgresql.Driver"
driverPath <- "/usr/local/spark/jars/postgresql-42.1.4.jar"
url <- "jdbc:postgresql://9.87.654.321:5432/postgres"
databaseUsername <- "postgres"
databasePassword <- "hot6cold"
databaseSchema <- "myschema"
databaseTable <- "cars"
drv <- JDBC(driverClassName, driverPath)
conn <- dbConnect(drv, url, databaseUsername, databasePassword)
data <- dbReadTable(conn, paste(databaseSchema, '.', databaseTable, sep=""))

Python example for MS SQL Server

The following example requires the MS SQL pymssql Python client module. If you need to install it, see Install a global Python library.

import dsx_core_utils, os, io, sys
import pandas as pd
import pymssql
df4 = None
# Read data set and data source information from Watson Studio Local
dataSet = dsx_core_utils.get_remote_data_set_info('mssql-customer')
dataSource = dsx_core_utils.get_data_source_info(dataSet['datasource'])
# jdbc:sqlserver://9.30.54.105:1433;databaseName=master
dbServer = dataSource['URL'].split('//')[1].split(';')[0].split(':')[0]
dbDatabase = dataSource['URL'].split('//')[1].split(';')[1].split('=')[1]
# Establish a connection to MS Sql Database
conn = pymssql.connect(server=dbServer, user=dataSource['user'], password=dataSource['password'], database=dbDatabase) 
# Define the SQL query
query = 'select * from ' + (dataSet['schema'] + '.' if (len(dataSet['schema'].strip()) != 0) else '') + dataSet['table']
# Execute the query and fetch data
curs = conn.cursor()
curs.execute(query)
queryData = curs.fetchall()

# Attach column names to pandas dataFrame
colNames = []
for i in range(0, len(curs.description)):
    colNames.append(curs.description[i][0])
df4 = pd.DataFrame(data=queryData, columns=colNames)
df4.head(10)

Connect to Alluxio using Watson Studio Local Spark

To connect to Alluxio using the Watson Studio Local cluster, you must first ensure the impersonation is properly configured on the Alluxio server by add the following lines in the Alluxio server configuration file conf/alluxio-site.properties (in this example, root act on behalf of other users (impersonation users)):

alluxio.master.security.impersonation.root.users=*
alluxio.master.security.impersonation.root.groups=*

See Alluxio impersonation documentation for details.

Requirement: The Watson Studio Local administrator must add import the JDBC driver for the Alluxio client.

Python 3.5 notebook example for reading data from Alluxio:

from pyspark.sql import SparkSession
spark = SparkSession(sc).builder.getOrCreate()
spark.read.csv("alluxio://9.87.654.321:19998/tennis.csv").show()