Tabular Data using Energy Flow Polynomials
Contents
Tabular Data using Energy Flow Polynomials#
In this lab, we will treat jets as tabular data using energy flow polynomials (EFPs)
%pip install -q jetnet mplhep
%pip install -U matplotlib
#%pip install matplotlib==3.1.3 #might need this line... not sure
Note: you may need to restart the kernel to use updated packages.
Requirement already satisfied: matplotlib in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (3.5.3)
Requirement already satisfied: cycler>=0.10 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from matplotlib) (0.11.0)
Requirement already satisfied: pillow>=6.2.0 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from matplotlib) (9.3.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from matplotlib) (1.4.4)
Requirement already satisfied: pyparsing>=2.2.1 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from matplotlib) (3.0.9)
Requirement already satisfied: packaging>=20.0 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from matplotlib) (21.3)
Requirement already satisfied: python-dateutil>=2.7 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from matplotlib) (2.8.2)
Requirement already satisfied: fonttools>=4.22.0 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from matplotlib) (4.38.0)
Requirement already satisfied: numpy>=1.17 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from matplotlib) (1.21.6)
Requirement already satisfied: typing-extensions in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from kiwisolver>=1.0.1->matplotlib) (4.4.0)
Requirement already satisfied: six>=1.5 in /opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
Download dataset using JetNet library#
Download top quarks and light quark datasets using JetNet library
import jetnet
data_t = jetnet.datasets.JetNet(jet_type="t")
data_t.download_and_convert_to_pt(data_dir="./", jet_type="t")
data_q = jetnet.datasets.JetNet(jet_type="q")
data_q.download_and_convert_to_pt(data_dir="./", jet_type="q")
Downloading t dataset to .//t.hdf5
Downloading dataset
[..................................................] 1%
[█.................................................] 2%
[█.................................................] 4%
[██................................................] 5%
[██................................................] 6%
[███...............................................] 7%
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
/tmp/ipykernel_4947/1561504718.py in <module>
1 import jetnet
2
----> 3 data_t = jetnet.datasets.JetNet(jet_type="t")
4 data_t.download_and_convert_to_pt(data_dir="./", jet_type="t")
5
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/jetnet/datasets/jetnet.py in __init__(self, jet_type, data_dir, particle_features, jet_features, particle_normalisation, jet_normalisation, particle_transform, jet_transform, num_particles, split, split_fraction, seed)
93 split,
94 split_fraction,
---> 95 seed,
96 )
97
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/jetnet/datasets/jetnet.py in getData(cls, jet_type, data_dir, particle_features, jet_features, num_particles, split, split_fraction, seed)
171 dataset_name=dname,
172 record_id=cls._zenodo_record_ids["150" if use_150 else "30"],
--> 173 key=f"{dname}.hdf5",
174 )
175
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/jetnet/datasets/utils.py in checkDownloadZenodoDataset(data_dir, dataset_name, record_id, key)
59
60 print(f"Downloading {dataset_name} dataset to {file_path}")
---> 61 download_progress_bar(file_url, file_path)
62
63 return file_path
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/jetnet/datasets/utils.py in download_progress_bar(file_url, file_dest)
37
38 print("Downloading dataset")
---> 39 for data in response.iter_content(chunk_size=max(int(total / 1000), 1024 * 1024)):
40 downloaded += len(data)
41 f.write(data)
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/requests/models.py in generate()
814 if hasattr(self.raw, "stream"):
815 try:
--> 816 yield from self.raw.stream(chunk_size, decode_content=True)
817 except ProtocolError as e:
818 raise ChunkedEncodingError(e)
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/urllib3/response.py in stream(self, amt, decode_content)
625 else:
626 while not is_fp_closed(self._fp):
--> 627 data = self.read(amt=amt, decode_content=decode_content)
628
629 if data:
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/urllib3/response.py in read(self, amt, decode_content, cache_content)
564
565 with self._error_catcher():
--> 566 data = self._fp_read(amt) if not fp_closed else b""
567 if amt is None:
568 flush_decoder = True
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/urllib3/response.py in _fp_read(self, amt)
530 else:
531 # StringIO doesn't like amt=None
--> 532 return self._fp.read(amt) if amt is not None else self._fp.read()
533
534 def read(self, amt=None, decode_content=None, cache_content=False):
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/http/client.py in read(self, amt)
463 # Amount is given, implement using readinto
464 b = bytearray(amt)
--> 465 n = self.readinto(b)
466 return memoryview(b)[:n].tobytes()
467 else:
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/http/client.py in readinto(self, b)
507 # connection, and the user is reading more bytes than will be provided
508 # (for example, reading in 1k chunks)
--> 509 n = self.fp.readinto(b)
510 if not n and b:
511 # Ideally, we would raise IncompleteRead if the content-length
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/socket.py in readinto(self, b)
587 while True:
588 try:
--> 589 return self._sock.recv_into(b)
590 except timeout:
591 self._timeout_occurred = True
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/ssl.py in recv_into(self, buffer, nbytes, flags)
1069 "non-zero flags not allowed in calls to recv_into() on %s" %
1070 self.__class__)
-> 1071 return self.read(nbytes, buffer)
1072 else:
1073 return super().recv_into(buffer, nbytes, flags)
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/ssl.py in read(self, len, buffer)
927 try:
928 if buffer is not None:
--> 929 return self._sslobj.read(len, buffer)
930 else:
931 return self._sslobj.read(len)
KeyboardInterrupt:
import torch
n = 2000
data_t_pt = torch.load("t.pt")[:n]
data_q_pt = torch.load("q.pt")[:n]
X_jets_pt = torch.cat([data_t_pt, data_q_pt])
y_pt = torch.cat([torch.ones(n), torch.zeros(n)])
y_np = y_pt.numpy()
Calculate Energy Flow Polynomials#
X_efps_np = jetnet.utils.efps(
X_jets_pt[:, :, :3].numpy(), efpset_args=[("n==", 4), ("d==", 4), ("p==", 1)]
)
Split data#
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X_efps_np, y_np, stratify=y_np, random_state=42, test_size=0.25
)
Plot EFPs#
import matplotlib.pyplot as plt
import mplhep as hep
hep.style.use(hep.style.ROOT)
import numpy as np
plt.figure()
plt.hist(
X_test[:, 0],
weights=y_test,
bins=np.arange(0, 0.004, 0.0001),
alpha=0.7,
label="top quark",
)
plt.hist(
X_test[:, 0],
weights=(1 - y_test),
bins=np.arange(0, 0.004, 0.0001),
alpha=0.7,
label="light quark",
)
plt.xlabel("EFP")
plt.legend()
plt.show()
Train Boosted Decision Tree#
!pip install -q xgboost
import xgboost as xgb
clf = xgb.XGBClassifier(max_depth=3, n_estimators=10, n_jobs=-1)
clf.fit(X_train, y_train)
Evaluate BDT Performance#
from sklearn.metrics import roc_curve, auc
y_xgb = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_xgb)
auc_t = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, label="AUC = {}".format(auc_t))
plt.legend()
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.show()