{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Week 4 Notebook: Simple Classifiers\n", "===============================================================\n", "\n", "This week, we're going to build some simple classifiers." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import tensorflow.keras as keras\n", "import numpy as np\n", "from sklearn.metrics import roc_curve, auc\n", "import matplotlib.pyplot as plt\n", "import uproot\n", "import utils" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import yaml\n", "\n", "with open('definitions_simple.yml') as file:\n", " # The FullLoader parameter handles the conversion from YAML\n", " # scalar values to Python the dictionary format\n", " definitions = yaml.load(file, Loader=yaml.FullLoader)\n", " \n", "features = definitions['features']\n", "spectators = definitions['spectators']\n", "labels = definitions['labels']\n", "\n", "nfeatures = definitions['nfeatures']\n", "nspectators = definitions['nspectators']\n", "nlabels = definitions['nlabels']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's set up a function to get features and labels." 
]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load training file\n",
    "train_file = 'root://eospublic.cern.ch//eos/opendata/cms/datascience/HiggsToBBNtupleProducerTool/HiggsToBBNTuple_HiggsToBB_QCD_RunII_13TeV_MC/train/ntuple_merged_10.root'\n",
    "feature_array, label_array, spec_array = utils.get_features_labels(\n",
    "    train_file,\n",
    "    features, spectators, labels,\n",
    "    remove_mass_pt_window=False,\n",
    "    entry_stop=20000,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Decision Tree Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import tree\n",
    "\n",
    "# random_state is fixed so that re-running the notebook rebuilds the same tree\n",
    "clf = tree.DecisionTreeClassifier(max_depth=5, random_state=42)\n",
    "# column 1 of the label array is used as the binary target\n",
    "# (presumably the signal class -- confirm against the order of `labels`)\n",
    "clf = clf.fit(feature_array, label_array[:,1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Support Vector Machine Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import linear_model\n",
    "\n",
    "# SGDClassifier's default loss is 'hinge', i.e. a linear SVM fitted with\n",
    "# stochastic gradient descent. It shuffles the training data, so random_state\n",
    "# is fixed to make the fit reproducible.\n",
    "svm = linear_model.SGDClassifier(random_state=42)\n",
    "svm.fit(feature_array, label_array[:,1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fully Connected Neural Network Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensorflow.keras.models import Model\n",
    "from tensorflow.keras.layers import Input, Dense, BatchNormalization\n",
    "\n",
    "# define dense keras model: batch normalization on the raw inputs, three\n",
    "# ReLU hidden layers, and a softmax output with one unit per label\n",
    "inputs = Input(shape=(nfeatures,), name='input')\n",
    "x = BatchNormalization(name='bn_1')(inputs)\n",
    "x = Dense(64, name='dense_1', activation='relu')(x)\n",
    "x = Dense(32, name='dense_2', activation='relu')(x)\n",
    "x = Dense(32, name='dense_3', activation='relu')(x)\n",
    "outputs = Dense(nlabels, name='output', activation='softmax')(x)\n",
    "keras_model = Model(inputs=inputs, outputs=outputs)\n",
    "keras_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])\n",
    "# summary() prints the table itself and returns None, so it is not wrapped in print()\n",
    "keras_model.summary()"
   ]
  },
  {
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# define callbacks\n", "from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau\n", "\n", "early_stopping = EarlyStopping(monitor='val_loss', patience=20)\n", "reduce_lr = ReduceLROnPlateau(patience=5,factor=0.5)\n", "model_checkpoint = ModelCheckpoint('keras_model_best.h5', monitor='val_loss', save_best_only=True)\n", "callbacks = [early_stopping, model_checkpoint, reduce_lr]\n", "\n", "# fit keras model\n", "history = keras_model.fit(feature_array, label_array, batch_size=1024, \n", " epochs=100, validation_split=0.2, shuffle=False,\n", " callbacks = callbacks, verbose=0)\n", "# reload best weights\n", "keras_model.load_weights('keras_model_best.h5')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure()\n", "plt.plot(history.history['loss'],label='Loss')\n", "plt.plot(history.history['val_loss'],label='Val. loss')\n", "plt.xlabel('Epoch')\n", "plt.legend()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# load testing file\n", "feature_array_test, label_array_test, spec_array_test = utils.get_features_labels('root://eospublic.cern.ch//eos/opendata/cms/datascience/HiggsToBBNtupleProducerTool/HiggsToBBNTuple_HiggsToBB_QCD_RunII_13TeV_MC/test/ntuple_merged_0.root', \n", " features, spectators, labels,\n", " remove_mass_pt_window=True,\n", " entry_stop=30000)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# run model inference on test data set\n", "predict_array_nn = keras_model.predict(feature_array_test)[:,1]\n", "predict_array_tree = clf.predict_proba(feature_array_test)[:,1]\n", "predict_array_svm = svm.decision_function(feature_array_test)\n", "\n", "# create ROC curves\n", "fpr_tree, tpr_tree, threshold_tree = roc_curve(label_array_test[:,1], predict_array_tree)\n", 
"fpr_svm, tpr_svm, threshold_svm = roc_curve(label_array_test[:,1], predict_array_svm)\n", "fpr_nn, tpr_nn, threshold_nn = roc_curve(label_array_test[:,1], predict_array_nn)\n", " \n", "# plot ROC curves\n", "plt.figure()\n", "plt.plot(tpr_tree, fpr_tree, lw=2.5, label=\"Tree, AUC = {:.1f}%\".format(auc(fpr_tree,tpr_tree)*100))\n", "plt.plot(tpr_svm, fpr_svm, lw=2.5, label=\"SVM, AUC = {:.1f}%\".format(auc(fpr_svm,tpr_svm)*100))\n", "plt.plot(tpr_nn, fpr_nn, lw=2.5, label=\"NN, AUC = {:.1f}%\".format(auc(fpr_nn,tpr_nn)*100))\n", "plt.xlabel(r'True positive rate')\n", "plt.ylabel(r'False positive rate')\n", "plt.semilogy()\n", "plt.ylim(0.001,1)\n", "plt.xlim(0,1)\n", "plt.grid(True)\n", "plt.legend(loc='upper left')\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Try to add a boosted decision tree." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.4" } }, "nbformat": 4, "nbformat_minor": 2 }