In [60]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import GridSearchCV

# Evaluation
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error,mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


#classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

#split
from sklearn.model_selection import train_test_split
# import warnings

load data¶

In [61]:
df = pd.read_csv('../data/train.csv')
X=df.drop(columns=['Survived','Name','Ticket','Cabin'],axis=1)
y = df['Survived']

transform data¶

In [62]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

num_pipeline= Pipeline(
    steps=[
    ("imputer",SimpleImputer(strategy="median")),
    ("scaler",StandardScaler())

    ]
)
cat_pipeline=Pipeline(

    steps=[
    ("imputer",SimpleImputer(strategy="most_frequent")),
    ("one_hot_encoder",OneHotEncoder()),
    ("scaler",StandardScaler(with_mean=False))
    ]
)
preprocessor=ColumnTransformer(
    [
    ("num_pipeline",num_pipeline,num_features),
    ("cat_pipelines",cat_pipeline,cat_features)

    ]
)

X = preprocessor.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

load model¶

In [ ]:
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    f1 = f1_score(true, predicted)
    return accuracy, precision, recall, f1

models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(), 
    "naive_bayes": GaussianNB(),
    "neural_network": MLPClassifier()
}
params={
    "Logistic Regression": {'max_iter':[100,500,1000]},
    "K-Neighbors": {'weights' : ['uniform', 'distance']},
    "Decision Tree": {'splitter' : ["best", "random"]},
    "Random Forest": {'criterion' : ["gini", "entropy", "log_loss"]},
    "XGBClassifier": {}, 
    "naive_bayes": {},
    "neural_network": {'alpha' : [0.0001,0.001,0.01]}
}
model_list = []
f1_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    param=params[list(models.keys())[i]]

    gs = GridSearchCV(model,param,cv=3)
    gs.fit(X_train,y_train)
    model.set_params(**gs.best_params_)
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_accuracy , model_train_precision, model_train_recall,model_train_f1 = evaluate_model(y_train, y_train_pred)

    model_test_accuracy , model_test_precision, model_test_recall,model_test_f1 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- accuracy: {:.4f}".format(model_train_accuracy))
    print("- precision: {:.4f}".format(model_train_precision))
    print("- recall: {:.4f}".format(model_train_recall))
    print("- f1: {:.4f}".format(model_train_f1))

    print('----------------------------------')
    
    print('Model performance for Training set')
    print("- accuracy: {:.4f}".format(model_test_accuracy))
    print("- precision: {:.4f}".format(model_test_precision))
    print("- recall: {:.4f}".format(model_test_recall))
    print("- f1: {:.4f}".format(model_test_f1))

    f1_list.append(model_test_f1)
    
    print('='*35)
    print('\n')

Performance rank of different models¶

In [64]:
result=np.c_[model_list,f1_list]
sorted_indices = np.argsort(result[:, 1])[::-1]
result=result[sorted_indices]
result
Out[64]:
array([['Random Forest', '0.7916666666666666'],
       ['Logistic Regression', '0.7586206896551724'],
       ['neural_network', '0.7536231884057971'],
       ['K-Neighbors', '0.75'],
       ['naive_bayes', '0.7435897435897436'],
       ['XGBClassifier', '0.7346938775510204'],
       ['Decision Tree', '0.7019867549668874']], dtype='<U32')