We will be using a dataset with 297 data points and the following attributes: age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal, and the target variable (presence of heart disease).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import seaborn as sns
# load the data
df = pd.read_csv('heart.csv',names = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','target'])
df.head()
df.describe()
A Decision Tree is not a similarity-based algorithm but an information-based one, so there is no need to scale the data: each split is a threshold on a single attribute, and rescaling a feature does not change which splits are possible.
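As a quick, hedged sanity check of that claim (this cell is purely illustrative and uses its own throwaway split, not the one defined later), we can fit the same tree on raw and standardized copies of the features and compare accuracies, which should typically be identical:
# illustrative check: tree splits are thresholds on single attributes, so standardizing should not change them
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
X_raw = df.drop(['target'], axis=1)
y_raw = df['target']
Xtr, Xte, ytr, yte = train_test_split(X_raw, y_raw, test_size=0.3, random_state=0)
scaler = StandardScaler()
Xtr_std = scaler.fit_transform(Xtr)
Xte_std = scaler.transform(Xte)
acc_raw = accuracy_score(yte, DecisionTreeClassifier(random_state=0).fit(Xtr, ytr).predict(Xte))
acc_std = accuracy_score(yte, DecisionTreeClassifier(random_state=0).fit(Xtr_std, ytr).predict(Xte_std))
print("raw:", acc_raw, "| standardized:", acc_std)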
df.info()
df.groupby('target').mean()
By grouping by our target variable and taking the average of each attribute, we can see some noticeable differences between people with and without a heart disease diagnosis.
We can visualize these distribution differences with pair plots, using the target variable as the color hue.
sns.pairplot(df[['age','sex','cp','trestbps','chol','fbs','target']], hue="target")
sns.pairplot(df[['restecg','thalach','exang','oldpeak','slope','ca','thal','target']], hue="target")
# shuffle
df = shuffle(df).reset_index(drop=True)
# divide attributes between output variable (y) and explanatory variables (x)
df_x = df.drop(['target'], axis=1)
df_y = df['target']
# split between training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.3, random_state=1)
# fit decision tree on train set
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
# get performance of our decision tree on the test set
prediction = dtc.predict(X_test)
#Score
score_tree = accuracy_score(y_test, prediction)
print("Accuracy score: " + str( score_tree))
# visualize decision tree
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 10), dpi=300)
tree.plot_tree(dtc, filled=True, max_depth=3)
The tree seems to be too deep, which could create a problem of overfitting. We will try to change some parameters and to "prune" it to make it shallower.
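A quick, hedged way to check that suspicion is to compare the unpruned tree's accuracy on the training set with the test accuracy computed above; a fully grown tree typically fits its training data almost perfectly:
# compare train vs test accuracy of the unpruned tree to gauge overfitting
print("Train accuracy:", accuracy_score(y_train, dtc.predict(X_train)))
print("Test accuracy: ", accuracy_score(y_test, dtc.predict(X_test)))
print("Depth:", dtc.get_depth(), "| leaves:", dtc.get_n_leaves())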
We will try different splitting criteria and then prune the tree to improve accuracy.
### try different criteria to measure the quality of the split
# gini criterion
dtc = DecisionTreeClassifier(criterion='gini')
dtc.fit(X_train, y_train)
pred = dtc.predict(X_test)
print('Criterion = gini', accuracy_score(y_test, pred))
# entropy criterion
dtc = DecisionTreeClassifier(criterion='entropy')
dtc.fit(X_train, y_train)
pred = dtc.predict(X_test)
print('Criterion = entropy', accuracy_score(y_test, pred))
The Gini criterion gives us a higher accuracy.
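That comparison rests on a single train/test split, so it is fairly noisy; as a hedged refinement, cross-validation on the training set gives a more stable view (the fold count and random_state below are arbitrary choices):
# compare criteria with cross-validation instead of a single split
from sklearn.model_selection import cross_val_score
for criterion in ['gini', 'entropy']:
    scores = cross_val_score(DecisionTreeClassifier(criterion=criterion, random_state=17),
                             X_train, y_train,
                             cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=17))
    print(criterion, round(scores.mean(), 3), '+/-', round(scores.std(), 3))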
To choose the best hyperparameters ('max_depth' and 'min_samples_leaf') we will use a grid search (GridSearchCV) with stratified cross-validation.
# We will vary hyperparameters from 2 to 10
param_grid = {'max_depth': np.arange(2, 11), 'min_samples_leaf': np.arange(2, 11)}
# choose the gini criterion, which performed better above
dtc = DecisionTreeClassifier(criterion='gini')
model = GridSearchCV(estimator=dtc, param_grid=param_grid, n_jobs=-1, verbose=1,
                     cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=17))
model.fit(X_train, y_train)
# get best parameters
model.best_params_
# check improved accuracy with new hyperparameters
pred = model.predict(X_test)
accuracy_score(y_test, pred)
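It can also be useful to see which attributes the tuned tree actually relies on; a short sketch reading feature_importances_ from the best estimator (purely descriptive, not part of the original workflow):
# inspect which attributes drive the splits of the tuned tree
importances = pd.Series(model.best_estimator_.feature_importances_, index=df_x.columns)
print(importances.sort_values(ascending=False))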
# we visualize the improved tree
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 10), dpi=300)
tree.plot_tree(model.best_estimator_, filled=True, max_depth=3)
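As a side note, scikit-learn also supports post-pruning via cost-complexity pruning (the ccp_alpha parameter). The sketch below is only illustrative: it scans the pruning path and reports the alpha that happens to score best on the test set, whereas in practice alpha should be chosen by cross-validation on the training data.
# post-pruning alternative: scan the cost-complexity pruning path
path = DecisionTreeClassifier(random_state=17).cost_complexity_pruning_path(X_train, y_train)
best_alpha, best_acc = 0.0, 0.0
for alpha in path.ccp_alphas:
    alpha = max(float(alpha), 0.0)  # guard against tiny negative alphas from floating point
    pruned = DecisionTreeClassifier(random_state=17, ccp_alpha=alpha).fit(X_train, y_train)
    acc = accuracy_score(y_test, pruned.predict(X_test))
    if acc > best_acc:
        best_alpha, best_acc = alpha, acc
print("Best ccp_alpha:", best_alpha, "| test accuracy:", best_acc)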
Note: we could keep improving this algorithm with more sophisticated methods, such as Random Forests, but the objective of this exercise was to work only with Decision Trees.