In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Types of Plots

In [2]:
import warnings
warnings.filterwarnings("ignore")
In [3]:
url = 'http://bit.ly/2b72LNj'
df = pd.read_csv(url)
In [4]:
df.head()
Out[4]:
model mpg cyl disp hp drat wt qsec vs am gear carb
0 Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
1 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
2 Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
3 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
4 Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2

Scatter plot

In [5]:
plt.scatter(x = 'wt', y = 'mpg', data = df)
pass
_images/Basic_Plots_Solutions_6_0.png

Line plot

In [6]:
plt.plot('wt', 'mpg', '-o', data = df.sort_values('wt'))
pass
_images/Basic_Plots_Solutions_8_0.png

Fitting a regression line

Linear regression

In [7]:
sns.regplot('wt', 'mpg', data = df)
pass
_images/Basic_Plots_Solutions_11_0.png

Non-parametric regression

In [8]:
sns.regplot('wt', 'mpg', data = df, lowess = True,)
pass
_images/Basic_Plots_Solutions_13_0.png

Swarm plot

In [9]:
sns.swarmplot('gear', 'mpg', data = df)
pass
_images/Basic_Plots_Solutions_15_0.png

Bar plot

In [10]:
sns.barplot('gear', 'mpg', data = df)
pass
_images/Basic_Plots_Solutions_17_0.png

Box plot

In [11]:
sns.boxplot('gear', 'mpg', data = df)
pass
_images/Basic_Plots_Solutions_19_0.png

Violin plot

In [12]:
sns.violinplot('gear', 'mpg', data = df)
pass
_images/Basic_Plots_Solutions_21_0.png

Histogram

In [13]:
sns.distplot(df.mpg, kde=False, rug=True)
pass
_images/Basic_Plots_Solutions_23_0.png

Density plot

In [14]:
sns.distplot(df.mpg)
pass
_images/Basic_Plots_Solutions_25_0.png

Heatmap

In [15]:
import string
In [16]:
nrows = 20
ncols = 20
row_names = range(1970, 1970 + nrows)
col_names = [string.ascii_lowercase[i:i+5] for i in range(ncols)]
values = np.random.random((nrows, ncols))
In [17]:
df = pd.DataFrame(values, index = row_names, columns = col_names)
In [18]:
sns.heatmap(df)
pass
_images/Basic_Plots_Solutions_30_0.png
In [19]:
cmap = sns.light_palette('red', as_cmap=True)
g = sns.clustermap(df, cmap = cmap)
plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
pass
_images/Basic_Plots_Solutions_31_0.png

Exercises

In [20]:
subjects = pd.read_excel('data/capstone1/HIV Boot Camp.xls', sheetname='Patient Information')
In [21]:
subjects.head()
Out[21]:
ID AGE RACE SEX MSM IDU UNC Entry to Care Year Prior HIV Care Elsewhere On ART ART Type
0 1 38 AMIND F No No 2012 NO Yes Integrase
1 2 44 AMIND F No No 2006 YES No NaN
2 3 33 AMIND F No No 2005 NO No NaN
3 4 60 AMIND F No Yes 2003 NO Yes Integrase
4 6 47 AMIND F No No 2001 YES Yes Other

1. Make a linear regression plot of AGE by UNC Entry to Care Year. Is there any trend over time that you can observe?

In [22]:
sns.regplot(data=subjects, y='AGE', x='UNC Entry to Care Year')
pass
_images/Basic_Plots_Solutions_36_0.png

2. Chcek the help documentaion for sns.countplot. Then create a barchart of the count of the number of people by SEX.

In [23]:
sns.countplot(data=subjects, x='SEX')
pass
_images/Basic_Plots_Solutions_38_0.png

3. Make a horizontal stacked barchart of the count of the number of people by RACE and color by SEX. As before, look up help on sns.countplot to do so.

In [24]:
sns.countplot(data=subjects, y='RACE', hue='SEX')
pass
_images/Basic_Plots_Solutions_40_0.png

4. Make a barchart of the AGE of people by RACE.

In [25]:
sns.barplot(data=subjects, x='RACE', y='AGE')
pass
_images/Basic_Plots_Solutions_42_0.png

5. Make a barchart of the AGE of people by RACE, but arrange in order of increasing average age. To do so, you need to give the order argument a list of strings containing the sequnce of RACE desired.

Bonus: (More challenging) Try to get the list of strings using pandas rather than manually.

In [26]:
sns.barplot(data=subjects, x='RACE', y='AGE',
            order=subjects.groupby('RACE').mean().sort_values('AGE').index)
pass
_images/Basic_Plots_Solutions_44_0.png

6 Make a swarm plot of the AGE by RACE but only for subejcts who have MSM status equal to ‘Yes’.

In [27]:
sns.swarmplot(data=subjects[subjects.MSM=='Yes'], x='RACE', y='AGE')
pass
_images/Basic_Plots_Solutions_46_0.png

7. (Challenging) Make a heat map of the mean AGE where the rows correspond to RACe and the columns to ‘UNC Entry to Care Year’.

Hint: Look up the pivot_table method of pandas DataFrames.

In [28]:
data = subjects.pivot_table(index='RACE', columns='UNC Entry to Care Year', values='AGE')
In [29]:
sns.heatmap(data)
pass
_images/Basic_Plots_Solutions_49_0.png
In [ ]: