!python -V

Python 3.8.8


# installing using pip
!pip install seaborn

# installing using conda
#!conda install seaborn


import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# read the data from the csv files
covid_ca = pd.read_csv("https://covidtracking.com/data/download/california-history.csv")
covid_pa = pd.read_csv("https://covidtracking.com/data/download/pennsylvania-history.csv")
covid_ma = pd.read_csv("https://covidtracking.com/data/download/massachusetts-history.csv")
# concatenate the three data frames
covid = pd.concat([covid_ca, covid_pa, covid_ma])


# retain only the five columns listed
covid = covid[["date", "state", "deathIncrease", "hospitalizedCurrently", "totalTestResultsIncrease"]]

# change the date from string to datetime
covid["date"] = pd.to_datetime(covid["date"], format = "%Y-%m-%d")

# keep data from 2020-04-01 to 2020-12-31 (both ends inclusive)
covid = covid[(covid["date"] >= "2020-04-01") & (covid["date"] <= "2020-12-31")]

# add a new column hospitalizedLevel
def addHospitalizedLevel(row):
    """Classify a row's current hospitalization count as low/medium/high.

    row: a mapping (e.g. a data frame row passed by ``apply(..., axis = 1)``)
        with a numeric "hospitalizedCurrently" entry.

    Returns "high" for counts >= 10000, "low" for counts < 5000 and
    "medium" for everything in between. Returns None when the count is
    missing, so the level is treated as missing too.
    """
    hospitalized = row["hospitalizedCurrently"]
    # NaN fails both comparisons below, so without this guard a missing
    # count would silently be labelled "medium"
    if pd.isna(hospitalized):
        return None
    if hospitalized >= 10000:
        return "high"
    if hospitalized < 5000:
        return "low"
    return "medium"
covid["hospitalizedLevel"] = covid.apply(addHospitalizedLevel, axis = 1)

# specify categorical variables
covid = covid.astype({"state": "category", "hospitalizedLevel":"category"})

# show the first five rows of the data frame
covid.head()


# show dimension of the data frame
covid.shape

(825, 6)


# set the figure size using matplotlib
plt.figure(figsize=(8, 5))

# histogram with count on the y-axis
sns.histplot(x = "deathIncrease", data = covid)

<AxesSubplot:xlabel='deathIncrease', ylabel='Count'>


plt.figure(figsize=(8, 5))

# histogram with density on the y-axis
sns.histplot(x = "deathIncrease", data = covid, stat = "density")

<AxesSubplot:xlabel='deathIncrease', ylabel='Density'>


plt.figure(figsize=(8, 5))

# specify the total number of bins
sns.histplot(x = "deathIncrease", data = covid, bins = 10)

<AxesSubplot:xlabel='deathIncrease', ylabel='Count'>


plt.figure(figsize=(8, 5))

# specify the breaks of the bins
sns.histplot(x = "deathIncrease", data = covid, 
             bins = (0, 100, 200, 300, 400, 500, 600))

<AxesSubplot:xlabel='deathIncrease', ylabel='Count'>


plt.figure(figsize=(8, 5))

# specify the width of each bin
sns.histplot(x = "deathIncrease", data = covid, binwidth = 50)

<AxesSubplot:xlabel='deathIncrease', ylabel='Count'>


plt.figure(figsize=(8, 5))

# limit the histogram to between 0 and the maximum value of deathIncrease
# (Series.max() is the vectorized pandas reduction and skips NaN by default,
# unlike the builtin max(), which loops over the values in Python)
sns.histplot(x = "deathIncrease", data = covid, 
             binrange = (0, covid["deathIncrease"].max()))

<AxesSubplot:xlabel='deathIncrease', ylabel='Count'>


plt.figure(figsize=(8, 5))

# histogram with density on the y-axis and kernel density estimate overlaid
sns.histplot(x = "deathIncrease", data = covid, stat = "density", kde = True)

<AxesSubplot:xlabel='deathIncrease', ylabel='Density'>


plt.figure(figsize=(8, 5))

# histogram with a different color
sns.histplot(x = "deathIncrease", data = covid, color = "darksalmon")

<AxesSubplot:xlabel='deathIncrease', ylabel='Count'>


plt.figure(figsize=(8, 5))

# kernel density estimate
sns.kdeplot(x = "deathIncrease", data = covid)

<AxesSubplot:xlabel='deathIncrease', ylabel='Density'>


plt.figure(figsize=(8, 5))

# kernel density estimate with area under curve filled
sns.kdeplot(x = "deathIncrease", data = covid, 
            color = "darksalmon", fill = True)

<AxesSubplot:xlabel='deathIncrease', ylabel='Density'>


plt.figure(figsize=(8, 5))

# limit the density curve to between 0 and the maximum value of deathIncrease
# (Series.max() is the vectorized pandas reduction and skips NaN by default,
# unlike the builtin max(), which loops over the values in Python)
sns.kdeplot(x = "deathIncrease", data = covid, 
            clip = (0, covid["deathIncrease"].max()))

<AxesSubplot:xlabel='deathIncrease', ylabel='Density'>


plt.figure(figsize=(8, 5))

# less smoothing
sns.kdeplot(x = "deathIncrease", data = covid, bw_adjust = 0.5)

<AxesSubplot:xlabel='deathIncrease', ylabel='Density'>


plt.figure(figsize=(8, 5))

# more smoothing
sns.kdeplot(x = "deathIncrease", data = covid, bw_adjust = 3)

<AxesSubplot:xlabel='deathIncrease', ylabel='Density'>


plt.figure(figsize=(8, 5))

# estimate the cumulative density
sns.kdeplot(x = "deathIncrease", data = covid, cumulative = True)

<AxesSubplot:xlabel='deathIncrease', ylabel='Density'>


plt.figure(figsize=(8, 4))

# box plot - horizontal
sns.boxplot(x = "deathIncrease", data = covid)

<AxesSubplot:xlabel='deathIncrease'>


plt.figure(figsize=(4, 6))

#box plot - vertical
sns.boxplot(y = "deathIncrease", data = covid, color = "darksalmon")

<AxesSubplot:ylabel='deathIncrease'>


plt.figure(figsize=(8, 4))

# strip plot
sns.stripplot(x = "deathIncrease", data = covid)

<AxesSubplot:xlabel='deathIncrease'>


plt.figure(figsize=(8, 4))

# strip plot with more jitter
sns.stripplot(x = "deathIncrease", data = covid, jitter = 0.3)

<AxesSubplot:xlabel='deathIncrease'>


plt.figure(figsize=(8, 4))

# strip plot on top of box plot
sns.boxplot(x = "deathIncrease", data = covid)
sns.stripplot(x = "deathIncrease", data = covid, 
              color = "darksalmon", jitter = 0.4, alpha = 0.4)

<AxesSubplot:xlabel='deathIncrease'>


plt.figure(figsize=(4, 6))

# strip plot on top of box plot, coordinates flipped
sns.boxplot(y = "deathIncrease", data = covid)
sns.stripplot(y = "deathIncrease", data = covid, 
              color = "darksalmon", jitter = 0.4, alpha = 0.4)

<AxesSubplot:ylabel='deathIncrease'>


plt.figure(figsize=(8, 4))

# swarm plot
sns.swarmplot(x = "deathIncrease", data = covid.iloc[0:300,])

<AxesSubplot:xlabel='deathIncrease'>


plt.figure(figsize=(4, 6))

# swarm plot on top of box plot, coordinates flipped
sns.boxplot(y = "deathIncrease", data = covid)
sns.swarmplot(y = "deathIncrease", data = covid.iloc[0:300,], color = "darksalmon", alpha = 0.5)

<AxesSubplot:ylabel='deathIncrease'>


plt.figure(figsize=(8, 4))

# violin plot
sns.violinplot(x = "deathIncrease", data = covid)

<AxesSubplot:xlabel='deathIncrease'>


plt.figure(figsize=(8, 4))

# violin plot with the quartiles drawn inside the violin
sns.violinplot(x = "deathIncrease", data = covid, inner = "quartile")

<AxesSubplot:xlabel='deathIncrease'>


plt.figure(figsize=(8, 5))

# layered histogram
sns.histplot(x = "deathIncrease", data = covid, hue = "state")

<AxesSubplot:xlabel='deathIncrease', ylabel='Count'>


plt.figure(figsize=(8, 5))

# stacked histogram
sns.histplot(x = "deathIncrease", data = covid, 
             hue = "state", multiple = "stack")

<AxesSubplot:xlabel='deathIncrease', ylabel='Count'>


plt.figure(figsize=(8, 5))

# histogram showing conditional probability of each category at each bin
sns.histplot(x = "deathIncrease", data = covid, 
             hue = "state", multiple = "fill").set(ylabel = "conditional probability")

[Text(0, 0.5, 'conditional probability')]


# facetted histogram on two categorical variables
sns.displot(x = "deathIncrease", data = covid, 
            row = "hospitalizedLevel", col = "state", height = 3)

<seaborn.axisgrid.FacetGrid at 0x7faad4c22fa0>


plt.figure(figsize=(8, 5))

# kernel density estimate, one curve per state
sns.kdeplot(x = "deathIncrease", data = covid, hue = "state")

<AxesSubplot:xlabel='deathIncrease', ylabel='Density'>


# facetted density curve
sns.displot(x = "deathIncrease", data = covid, 
            col = "state", kind = "kde")

<seaborn.axisgrid.FacetGrid at 0x7faaf6715df0>


plt.figure(figsize=(6, 8))

# side-by-side box plot
sns.boxplot(x = "state", y = "deathIncrease", data = covid)

<AxesSubplot:xlabel='state', ylabel='deathIncrease'>


plt.figure(figsize=(10, 8))

# side-by-side box plot grouped by two categorical variables
sns.boxplot(x = "state", y = "deathIncrease", data = covid, 
            hue = "hospitalizedLevel")

<AxesSubplot:xlabel='state', ylabel='deathIncrease'>


# facetted box plots
sns.catplot(x = "state", y = "deathIncrease", data = covid, 
            col = "hospitalizedLevel", kind = "box", width = 0.5)

<seaborn.axisgrid.FacetGrid at 0x7faad8f58a90>


plt.figure(figsize=(8, 5))

# bar plot
sns.countplot(x = "hospitalizedLevel", data = covid)

<AxesSubplot:xlabel='hospitalizedLevel', ylabel='count'>


plt.figure(figsize=(8, 5))

# bar plot with levels rearranged
sns.countplot(x = "hospitalizedLevel", data = covid, 
              order = ["low", "medium", "high"])

<AxesSubplot:xlabel='hospitalizedLevel', ylabel='count'>


plt.figure(figsize=(8, 5))

# bar plot with percentage
sns.histplot(x = "hospitalizedLevel", data = covid, 
             stat = "probability", discrete = True)

<AxesSubplot:xlabel='hospitalizedLevel', ylabel='Probability'>


plt.figure(figsize=(5, 5))

# pie chart - demonstration of bad usage
dummy_data = [33, 36, 31]
plt.pie(dummy_data)

([<matplotlib.patches.Wedge at 0x7faadac52580>,
  <matplotlib.patches.Wedge at 0x7faadac52a60>,
  <matplotlib.patches.Wedge at 0x7faadac52ee0>],
 [Text(0.5599455183205815, 0.9468162527717273, ''),
  Text(-1.0978293924762963, -0.06906971127148508, ''),
  Text(0.6182918791840069, -0.9097885205557991, '')])


plt.figure(figsize=(8, 5))

# group the data by hospitalizedLevel, then count the observations in each group
covid_temp = covid.groupby("hospitalizedLevel", as_index = False).size()
# point plot
sns.pointplot(x = "hospitalizedLevel", y = "size", data = covid_temp, 
              order = ["low", "medium", "high"]).set(ylabel = "count")

[Text(0, 0.5, 'count')]


plt.figure(figsize=(8, 5))

# side-by-side bar plot
sns.countplot(x = "hospitalizedLevel", data = covid, 
              order = ["low", "medium", "high"], hue = "state")

<AxesSubplot:xlabel='hospitalizedLevel', ylabel='count'>


# facetted bar plot
sns.catplot(x = "hospitalizedLevel", data = covid, 
            order = ["low", "medium", "high"], col = "state", kind = "count")

<seaborn.axisgrid.FacetGrid at 0x7faaf8a4bdc0>


plt.figure(figsize=(8, 5))

# group the data by hospitalizedLevel and state, then count the observations in each group
covid_temp = covid.groupby(["hospitalizedLevel", "state"], as_index = False).size()

# point plot with multiple layers
sns.pointplot(x = "hospitalizedLevel", y = "size", data = covid_temp, 
              order = ["low", "medium", "high"], hue = "state").set(ylabel = "count")

[Text(0, 0.5, 'count')]


plt.figure(figsize=(8, 5))

# point plot with multiple layers, dodged
sns.pointplot(x = "hospitalizedLevel", y = "size", data = covid_temp, 
              order = ["low", "medium", "high"], hue = "state", dodge = True).set(ylabel = "count")

[Text(0, 0.5, 'count')]


# group the data by hospitalizedLevel and state, then count the observations in each group
covid_temp = covid.groupby(["hospitalizedLevel", "state"], as_index = False).size()

# facetted point plot
sns.catplot(x = "hospitalizedLevel", y = "size", data = covid_temp, 
            order = ["low", "medium", "high"], col = "state", kind = "point").set(ylabel = "count")

<seaborn.axisgrid.FacetGrid at 0x7faae7f0ba30>


plt.figure(figsize=(8, 5))

# scatter plot
sns.scatterplot(x = "hospitalizedCurrently", y = "deathIncrease", data = covid)

<AxesSubplot:xlabel='hospitalizedCurrently', ylabel='deathIncrease'>


plt.figure(figsize=(8, 5))

# scatter plot, points colored by state
sns.scatterplot(x = "hospitalizedCurrently", y = "deathIncrease", data = covid, 
                hue = "state", alpha = 0.7)

<AxesSubplot:xlabel='hospitalizedCurrently', ylabel='deathIncrease'>


plt.figure(figsize=(8, 5))

# scatter plot, shaped by hospitalizedLevel
sns.scatterplot(x = "hospitalizedCurrently", y = "deathIncrease", data = covid, 
                style = "hospitalizedLevel", alpha = 0.7)

<AxesSubplot:xlabel='hospitalizedCurrently', ylabel='deathIncrease'>


plt.figure(figsize=(8, 5))

# scatter plot, sized by totalTestResultsIncrease
sns.scatterplot(x = "hospitalizedCurrently", y = "deathIncrease", data = covid, 
                size = "totalTestResultsIncrease", alpha = 0.7)

<AxesSubplot:xlabel='hospitalizedCurrently', ylabel='deathIncrease'>


# facetted scatter plot
sns.relplot(x = "hospitalizedCurrently", y = "deathIncrease", data = covid, col = "state")

<seaborn.axisgrid.FacetGrid at 0x7faaf6f5a8e0>


plt.figure(figsize=(8, 5))

# scatter plot with linear regression fit
sns.regplot(x = "hospitalizedCurrently", y = "deathIncrease", data = covid, ci = 90)

<AxesSubplot:xlabel='hospitalizedCurrently', ylabel='deathIncrease'>


plt.figure(figsize=(8, 5))

# contour plot
sns.kdeplot(x = "hospitalizedCurrently", y = "deathIncrease", data = covid)

<AxesSubplot:xlabel='hospitalizedCurrently', ylabel='deathIncrease'>


plt.figure(figsize=(8, 5))

# contour plot with contour lines filled
sns.kdeplot(x = "hospitalizedCurrently", y = "deathIncrease", fill = True, data = covid)

<AxesSubplot:xlabel='hospitalizedCurrently', ylabel='deathIncrease'>


plt.figure(figsize=(8, 5))

# contour plot conditioned on state
sns.kdeplot(x = "hospitalizedCurrently", y = "deathIncrease", data = covid, hue = "state")

<AxesSubplot:xlabel='hospitalizedCurrently', ylabel='deathIncrease'>


plt.figure(figsize=(8, 5))

# round hospitalizedCurrently down to a multiple of 1000 and deathIncrease
# down to a multiple of 10; floor division on the whole column replaces the
# row-by-row apply and yields the same values
covid["hcRounded"] = covid["hospitalizedCurrently"] // 1000 * 1000
covid["diRounded"] = covid["deathIncrease"] // 10 * 10
# group by the rounded values and find the count for each combination of values
covid_rounded = covid.groupby(["hcRounded", "diRounded"], as_index = False).size()
# change from long to wide format: one row per hcRounded, one column per diRounded
new = covid_rounded.pivot(index = "hcRounded", columns = "diRounded", values = "size")
# combinations that never occur have no count after the pivot; fill those NaNs with zeros
new = new.fillna(0)
# keep the first 10 rows and 10 columns, the part of the matrix with mostly non-zero values
new = new.iloc[:10, :10]

# heat map
sns.heatmap(new, linewidths = 1, cmap = "flare")

<AxesSubplot:xlabel='diRounded', ylabel='hcRounded'>


plt.figure(figsize=(8, 5))

# keep data from 2020-04-01 to 2020-09-30 (both ends inclusive)
covid_subset = covid[(covid["date"] >= "2020-04-01") & (covid["date"] <= "2020-09-30")]

# line plot for time series data
sns.lineplot(x = "date", y = "deathIncrease", data = covid_subset)

<AxesSubplot:xlabel='date', ylabel='deathIncrease'>


plt.figure(figsize=(8, 5))

# line plot for each state
sns.lineplot(x = "date", y = "deathIncrease", data = covid_subset, hue = "state")

<AxesSubplot:xlabel='date', ylabel='deathIncrease'>


# facetted line plot
sns.relplot(x = "date", y = "deathIncrease", data = covid_subset, 
            col = "state", kind = "line")

<seaborn.axisgrid.FacetGrid at 0x7faaea8940d0>

	date	state	deathIncrease	hospitalizedCurrently	totalTestResultsIncrease	hospitalizedLevel
66	2020-12-31	CA	428	21449.0	232406	high
67	2020-12-30	CA	432	21433.0	248605	high
68	2020-12-29	CA	242	21240.0	245955	high
69	2020-12-28	CA	64	20642.0	301820	high
70	2020-12-27	CA	237	20059.0	380154	high

Tutorial: Data Visualization in Python with Seaborn¶

Author: Joanna Yao (xinyao@andrew.cmu.edu)¶

Introduction¶

Overview¶

1. Installing and Importing Seaborn¶

2. Loading the Data¶

3. Visualization: 1D Quantitative Data¶

(a) Histogram¶

(b) Density Curve¶

(c) Box Plot¶

(d) Strip Plot¶

(e) Swarm Plot¶

(f) Violin Plot¶

4. Visualization: Incorporating Categorical Data into 1D Quantitative Data¶

(a) Variations of Histogram¶

(b) Variations of Density Curve¶

(c) Variations of Box Plot¶

5. Visualization: 1D Categorical Data¶

(a) Bar Plot¶

(b) Pie Chart¶

(c) Point Plot¶

6. Visualization: Incorporating More Dimensions of Categorical Data¶

(a) Variations of Bar Plots¶

(b) Variations of Point Plot¶

7. Visualization: 2D Quantitative Data¶

(a) Scatter Plot¶

(b) Contour Plot¶

(c) Heat Map¶

(d) Line Plot¶

8. Higher Dimensional Data in General¶

9. Seaborn vs Matplotlib vs Other Choices¶