Bokeh Basics I
Bokeh Basics - I
Bokeh is a python library that enables users to create beautiful, dynamic and interactive visualizations.
To learn more about Bokeh, visit its website: https://bokeh.org
Want to create visualizations? Let’s get started!
# import required libraries
from bokeh.io import output_notebook, show, reset_output, output_file
import bokeh
from bokeh.plotting import figure
import numpy as np
import pandas as pd
# import library for toy datasets
from vega_datasets import data as vds
Load Data
We need data to plot! Bokeh provides example datasets we can use.
# Bokeh's sample-data package exposes the iris measurements as `iris.flowers`.
from bokeh.sampledata import iris
# load iris dataset (one row per flower, with a `species` label column)
df_iris = iris.flowers
# display first five rows in the df
df_iris.head()
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# Route Bokeh output to the notebook so that `show()` renders plots inline.
# In JupyterLab it is enough to run `output_notebook()` once;
# in Colab it has to be called in every cell that displays a plot.
output_notebook()
Creating Plots
To create plots we must follow this workflow:
- Create a figure.
- Create a glyph/plot. We have several options: line, bar, scatter.
- Show the plot.
Bokeh’s Data Structure
Bokeh uses the ColumnDataSource as its main data structure. The ColumnDataSource is a table-like data structure that maps string column names to sequences of data (columns). The ColumnDataSource is created automatically most of the time but it can also be created explicitly by passing a pandas dataframe to the class initializer:
data = ColumnDataSource(df)
# ColumnDataSource is provided by bokeh.models
from bokeh.models import ColumnDataSource

# Column name -> sequence of values; all three columns hold five entries.
columns = {
    'A': [1, 2, 3, 4, 5],
    'B': [5, 4, 3, 2, 1],
    'C': [1, 3, 5, 1, 2],
}

# NOTE: despite its name, `df` is a ColumnDataSource, not a pandas DataFrame.
df = ColumnDataSource(columns)

# Inspect the column mapping held by the source.
df.data
{'A': [1, 2, 3, 4, 5], 'B': [5, 4, 3, 2, 1], 'C': [1, 3, 5, 1, 2]}
Create a Line Plot
We can create some random data to pass as our x and y values.
# Line plot with a hover tool
from bokeh.models import HoverTool

# toy data: x runs over 0..9, y holds ten random values
x_ax = np.arange(10)
y_ax = np.random.rand(10)

# Step 1: the figure (canvas, title and axis labels).
# NOTE(review): plot_width/plot_height are the pre-Bokeh-3 argument names;
# confirm the Bokeh version this notebook targets.
line_plot = figure(
    title='Line Plot',
    x_axis_label='X',
    y_axis_label='Y',
    plot_width=600,
    plot_height=425,
)

# Step 2: the line glyph drawn on that figure.
line_plot.line(x=x_ax, y=y_ax, line_width=2, legend_label='line')

# A HoverTool created without arguments uses Bokeh's default tooltips.
line_plot.add_tools(HoverTool())

# Step 3: register an HTML output file and display the plot.
# NOTE: the path is absolute, so the file is written to the filesystem root
# and the process needs write permission there.
output_file('/line_chart.html')
show(line_plot)
Creating a multi-variable line plot
# enable inline notebook output for this cell
output_notebook()

# shared x values plus three independent random series
x_multi = np.arange(10)
y1_multi = np.random.rand(10)
y2_multi = np.random.rand(10)
y3_multi = np.random.rand(10)

# create the figure; the toolbar is placed below the plot area
multi_var_plot = figure(toolbar_location='below', plot_width=600, plot_height=400)

# one line glyph per series, each with its own colour and legend entry
for series, colour, label in (
    (y1_multi, 'yellow', 'y1'),
    (y2_multi, 'blue', 'y2'),
    (y3_multi, 'red', 'y3'),
):
    multi_var_plot.line(x_multi, series, color=colour, line_width=4, legend_label=label)

multi_var_plot.add_tools(HoverTool())

# NOTE: absolute path, so the HTML file is written to the filesystem root.
output_file('/multiline_chart.html')
show(multi_var_plot)
Creating Bar Charts
# five categories, each with a random bar height
x_ax = ['cat1', 'cat2', 'cat3', 'cat4', 'cat5']
y_ax = np.random.rand(5) * 10

# rank the categories from the tallest bar to the shortest
ranked = sorted(zip(x_ax, y_ax), key=lambda pair: pair[1], reverse=True)
sorted_cat = [name for name, _ in ranked]

# the categorical x_range decides the left-to-right order of the bars
bar_chart = figure(
    x_range=sorted_cat,
    title='Bar Chart',
    x_axis_label='X',
    y_axis_label='Y',
    plot_height=300,
)

# use vbar for vertical bars and hbar for horizontal ones
bar_chart.vbar(x=x_ax, top=y_ax, width=0.4, color='blue')

# start the value axis at zero
bar_chart.y_range.start = 0

bar_chart.add_tools(HoverTool())

# NOTE: absolute path, so the HTML file is written to the filesystem root.
output_file('/bar_chart.html')
show(bar_chart)
Stacked Bar Chart
# Stacked Bar Chart
# `y` is the bar position; `x1` and `x2` are the two segments stacked along x.
df_stacked = pd.DataFrame({'y': [1, 2, 3, 4, 5],
                           'x1': [1, 2, 4, 3, 4],
                           'x2': [1, 4, 2, 2, 3]})

# Wrap the frame once so that both stacked layers share a single data source.
# (The variable name is kept as-is so any other cell referring to it still works.)
df_CDS_tacked = ColumnDataSource(df_stacked)

stacked_bar_chart = figure(plot_width=600, plot_height=300, title='Stacked Bar Chart')

# One horizontal bar layer per stacker, coloured in the same order as the stackers.
# Pass the ColumnDataSource created above rather than the raw DataFrame; it was
# previously built but never used.
stacked_bar_chart.hbar_stack(['x1', 'x2'],
                             y='y',
                             height=0.8,
                             color=('green', 'lightgreen'),
                             source=df_CDS_tacked)

stacked_bar_chart.add_tools(HoverTool())

# NOTE: absolute path, so the HTML file is written to the filesystem root.
output_file('/stacked_bar_chart.html')
show(stacked_bar_chart)
Creating a Bar Chart Grouping Data
from bokeh.transform import dodge

# three categories with one value column per year
categories = ['category1', 'category2', 'category3']
df_grouped = pd.DataFrame({
    'categories': categories,
    '2018': [2, 1, 4],
    '2019': [5, 3, 3],
    '2020': [3, 2, 4],
})

# categorical x axis, fixed value axis from 0 to 10
bar_grouped = figure(x_range=categories, y_range=(0, 10), plot_height=350)

# shift each year's bars sideways within its category slot so they sit next to each other
dodge1, dodge2, dodge3 = (
    dodge('categories', shift, range=bar_grouped.x_range)
    for shift in (-0.25, 0.0, 0.25)
)

# one set of bars per year, each with its own offset, colour and legend entry
for position, year, colour in zip(
    (dodge1, dodge2, dodge3),
    ('2018', '2019', '2020'),
    ('blue', 'green', 'red'),
):
    bar_grouped.vbar(x=position, top=year, width=0.2, source=df_grouped,
                     color=colour, legend_label=year)

# legend goes in the top-left corner with its entries laid out in a row
bar_grouped.legend.location = 'top_left'
bar_grouped.legend.orientation = 'horizontal'

bar_grouped.add_tools(HoverTool())

# NOTE: absolute path, so the HTML file is written to the filesystem root.
output_file('/grouped_bar_chart.html')
show(bar_grouped)
Stacked Area Chart
# create dummy data for the chart: a shared x column and two series to stack
df_area_stacked = pd.DataFrame({'x': [1, 2, 3, 4, 5],
                                'y1': [1, 3, 1, 4, 5],
                                'y2': [1, 2, 3, 4, 2]})

stacked_area_chart = figure(plot_width=600, plot_height=300)

# stack the y1 and y2 areas in the listed order, coloured in that same order
stacked_area_chart.varea_stack(['y1', 'y2'],
                               x='x',
                               color=('coral', 'cadetblue'),
                               source=df_area_stacked)

# Give this chart its own output file. Without this call, the file target set
# by an earlier output_file() is still active and show() would overwrite that
# chart's HTML with this one.
# NOTE: absolute path, so the HTML file is written to the filesystem root.
output_file('/stacked_area_chart.html')
show(stacked_area_chart)
Scatter Plots
Load the car dataset from vega
# Load the cars table through the vega_datasets loader (`vds`) and preview its first five rows.
df_cars = vds.cars()
df_cars.head()
Name | Miles_per_Gallon | Cylinders | Displacement | Horsepower | Weight_in_lbs | Acceleration | Year | Origin | |
---|---|---|---|---|---|---|---|---|---|
0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 1970-01-01 | USA |
1 | buick skylark 320 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 1970-01-01 | USA |
2 | plymouth satellite | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 1970-01-01 | USA |
3 | amc rebel sst | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 1970-01-01 | USA |
4 | ford torino | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 1970-01-01 | USA |
Scatter Plot for Categorical Data
We can use the famous iris dataset to plot categorical data. This dataset contains attributes about three different flower species: setosa, versicolor, virginica.
# load iris df from vega
# NOTE: this rebinds `df_iris`, replacing the frame loaded earlier from bokeh.sampledata.
# The vega version uses camelCase column names (e.g. `petalLength`, `petalWidth`).
df_iris = vds.iris()
df_iris.head()
sepalLength | sepalWidth | petalLength | petalWidth | species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
from bokeh.transform import factor_cmap, factor_mark

# categories to encode, with one marker shape per species (paired by position)
species = ['setosa', 'versicolor', 'virginica']
markers = ['hex', 'cross', 'triangle']

sub_scatter = figure(plot_width=600,
                     plot_height=400,
                     title='Iris Scatter Plot',
                     x_axis_label='Petal Length',
                     y_axis_label='Petal Width')

# Colour and marker shape are both derived from the `species` column.
# legend_group creates one legend entry per distinct species value;
# legend_label='species' would instead give a single entry literally
# titled "species" for all points.
sub_scatter.scatter(x='petalLength',
                    y='petalWidth',
                    source=df_iris,
                    legend_group='species',
                    fill_alpha=0.5,
                    size=15,
                    color=factor_cmap(field_name='species', palette='Dark2_4', factors=species),
                    marker=factor_mark('species', markers, species))

sub_scatter.legend.location = "top_left"

# NOTE: absolute path, so the HTML file is written to the filesystem root.
output_file('/sub_scatter.html')
show(sub_scatter)