Tabular Data using Energy Flow Polynomials#

In this lab, we will treat jets as tabular data using energy flow polynomials (EFPs)

%pip install -q jetnet mplhep
%pip install -U matplotlib
#%pip install matplotlib==3.1.3 #might need this line... not sure
Note: you may need to restart the kernel to use updated packages.
Requirement already satisfied: matplotlib in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (3.5.3)
Requirement already satisfied: cycler>=0.10 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from matplotlib) (0.11.0)
Requirement already satisfied: pillow>=6.2.0 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from matplotlib) (9.3.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from matplotlib) (1.4.4)
Requirement already satisfied: pyparsing>=2.2.1 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from matplotlib) (3.0.9)
Requirement already satisfied: packaging>=20.0 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from matplotlib) (21.3)
Requirement already satisfied: python-dateutil>=2.7 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from matplotlib) (2.8.2)
Requirement already satisfied: fonttools>=4.22.0 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from matplotlib) (4.38.0)
Requirement already satisfied: numpy>=1.17 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from matplotlib) (1.21.6)
Requirement already satisfied: typing-extensions in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from kiwisolver>=1.0.1->matplotlib) (4.4.0)
Requirement already satisfied: six>=1.5 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)
Note: you may need to restart the kernel to use updated packages.

Download dataset using JetNet library#

Download top quarks and light quark datasets using JetNet library

import jetnet

data_t = jetnet.datasets.JetNet(jet_type="t")
data_t.download_and_convert_to_pt(data_dir="./", jet_type="t")

data_q = jetnet.datasets.JetNet(jet_type="q")
data_q.download_and_convert_to_pt(data_dir="./", jet_type="q")
Downloading t dataset to .//t.hdf5
Downloading dataset
[..................................................] 1%
[█.................................................] 2%
[█.................................................] 4%
[██................................................] 5%
[██................................................] 6%
[███...............................................] 7%
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
/tmp/ipykernel_4947/1561504718.py in <module>
      1 import jetnet
      2 
----> 3 data_t = jetnet.datasets.JetNet(jet_type="t")
      4 data_t.download_and_convert_to_pt(data_dir="./", jet_type="t")
      5 

/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/jetnet/datasets/jetnet.py in __init__(self, jet_type, data_dir, particle_features, jet_features, particle_normalisation, jet_normalisation, particle_transform, jet_transform, num_particles, split, split_fraction, seed)
     93             split,
     94             split_fraction,
---> 95             seed,
     96         )
     97 

/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/jetnet/datasets/jetnet.py in getData(cls, jet_type, data_dir, particle_features, jet_features, num_particles, split, split_fraction, seed)
    171                 dataset_name=dname,
    172                 record_id=cls._zenodo_record_ids["150" if use_150 else "30"],
--> 173                 key=f"{dname}.hdf5",
    174             )
    175 

/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/jetnet/datasets/utils.py in checkDownloadZenodoDataset(data_dir, dataset_name, record_id, key)
     59 
     60         print(f"Downloading {dataset_name} dataset to {file_path}")
---> 61         download_progress_bar(file_url, file_path)
     62 
     63     return file_path

/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/jetnet/datasets/utils.py in download_progress_bar(file_url, file_dest)
     37 
     38             print("Downloading dataset")
---> 39             for data in response.iter_content(chunk_size=max(int(total / 1000), 1024 * 1024)):
     40                 downloaded += len(data)
     41                 f.write(data)

/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/requests/models.py in generate()
    814             if hasattr(self.raw, "stream"):
    815                 try:
--> 816                     yield from self.raw.stream(chunk_size, decode_content=True)
    817                 except ProtocolError as e:
    818                     raise ChunkedEncodingError(e)

/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/urllib3/response.py in stream(self, amt, decode_content)
    625         else:
    626             while not is_fp_closed(self._fp):
--> 627                 data = self.read(amt=amt, decode_content=decode_content)
    628 
    629                 if data:

/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/urllib3/response.py in read(self, amt, decode_content, cache_content)
    564 
    565         with self._error_catcher():
--> 566             data = self._fp_read(amt) if not fp_closed else b""
    567             if amt is None:
    568                 flush_decoder = True

/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/urllib3/response.py in _fp_read(self, amt)
    530         else:
    531             # StringIO doesn't like amt=None
--> 532             return self._fp.read(amt) if amt is not None else self._fp.read()
    533 
    534     def read(self, amt=None, decode_content=None, cache_content=False):

/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/http/client.py in read(self, amt)
    463             # Amount is given, implement using readinto
    464             b = bytearray(amt)
--> 465             n = self.readinto(b)
    466             return memoryview(b)[:n].tobytes()
    467         else:

/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/http/client.py in readinto(self, b)
    507         # connection, and the user is reading more bytes than will be provided
    508         # (for example, reading in 1k chunks)
--> 509         n = self.fp.readinto(b)
    510         if not n and b:
    511             # Ideally, we would raise IncompleteRead if the content-length

/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/socket.py in readinto(self, b)
    587         while True:
    588             try:
--> 589                 return self._sock.recv_into(b)
    590             except timeout:
    591                 self._timeout_occurred = True

/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/ssl.py in recv_into(self, buffer, nbytes, flags)
   1069                   "non-zero flags not allowed in calls to recv_into() on %s" %
   1070                   self.__class__)
-> 1071             return self.read(nbytes, buffer)
   1072         else:
   1073             return super().recv_into(buffer, nbytes, flags)

/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/ssl.py in read(self, len, buffer)
    927         try:
    928             if buffer is not None:
--> 929                 return self._sslobj.read(len, buffer)
    930             else:
    931                 return self._sslobj.read(len)

KeyboardInterrupt: 
import torch

n = 2000
data_t_pt = torch.load("t.pt")[:n]
data_q_pt = torch.load("q.pt")[:n]
X_jets_pt = torch.cat([data_t_pt, data_q_pt])
y_pt = torch.cat([torch.ones(n), torch.zeros(n)])
y_np = y_pt.numpy()

Calculate Energy Flow Polynomials#

X_efps_np = jetnet.utils.efps(
    X_jets_pt[:, :, :3].numpy(), efpset_args=[("n==", 4), ("d==", 4), ("p==", 1)]
)

Split data#

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_efps_np, y_np, stratify=y_np, random_state=42, test_size=0.25
)

Plot EFPs#

import matplotlib.pyplot as plt
import mplhep as hep

hep.style.use(hep.style.ROOT)
import numpy as np

plt.figure()
plt.hist(
    X_test[:, 0],
    weights=y_test,
    bins=np.arange(0, 0.004, 0.0001),
    alpha=0.7,
    label="top quark",
)
plt.hist(
    X_test[:, 0],
    weights=(1 - y_test),
    bins=np.arange(0, 0.004, 0.0001),
    alpha=0.7,
    label="light quark",
)
plt.xlabel("EFP")
plt.legend()
plt.show()

Train Boosted Decision Tree#

!pip install -q xgboost
import xgboost as xgb

clf = xgb.XGBClassifier(max_depth=3, n_estimators=10, n_jobs=-1)

clf.fit(X_train, y_train)

Evaluate BDT Performance#

from sklearn.metrics import roc_curve, auc

y_xgb = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_xgb)
auc_t = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label="AUC = {}".format(auc_t))
plt.legend()
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.show()