Training model
In [1]:
Copied!
%reload_ext autoreload
%autoreload 2
%reload_ext autoreload
%autoreload 2
In [2]:
Copied!
import numpy as np
import logging
from polymetrix.datasets.curated_tg_dataset import CuratedGlassTempDataset
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from mofdscribe.splitters.splitters import LOCOCV
from polymetrix.splitters.splitters import TgSplitter
import numpy as np
import logging
from polymetrix.datasets.curated_tg_dataset import CuratedGlassTempDataset
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from mofdscribe.splitters.splitters import LOCOCV
from polymetrix.splitters.splitters import TgSplitter
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[2], line 3 1 import numpy as np 2 import logging ----> 3 from polymetrix.datasets.curated_tg_dataset import CuratedGlassTempDataset 4 from sklearn.ensemble import GradientBoostingRegressor 5 from sklearn.metrics import mean_absolute_error File ~/work/PolyMetriX/PolyMetriX/src/polymetrix/datasets/__init__.py:2 1 from .dataset import AbstractDataset ----> 2 from .curated_tg_dataset import CuratedGlassTempDataset 4 __all__ = ['AbstractDataset', 'CuratedGlassTempDataset'] File ~/work/PolyMetriX/PolyMetriX/src/polymetrix/datasets/curated_tg_dataset.py:4 2 from collections.abc import Collection 3 from typing import Optional, List ----> 4 from polymetrix.constants import POLYMETRIX_PYSTOW_MODULE 5 from polymetrix.datasets import AbstractDataset 8 class CuratedGlassTempDataset(AbstractDataset): ModuleNotFoundError: No module named 'polymetrix.constants'
In [3]:
Copied!
# Configuration
RANDOM_STATE = 42  # single seed reused by the model and the splitters in this notebook

# Initialize logging
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# Load dataset
dataset = CuratedGlassTempDataset(
    feature_levels=["sidechainlevel", "backbonelevel", "fullpolymerlevel"]
)

# Extract features and labels for every entry. The index array is built once
# and shared so that features and labels are requested for the same rows.
all_idx = np.arange(len(dataset))
X = dataset.get_features(idx=all_idx)
# .ravel() flattens the label array to 1-D for the regressor
# (presumably get_labels returns one column per requested label -- confirm).
y = dataset.get_labels(idx=all_idx, label_names=["labels.Exp_Tg(K)"]).ravel()

# Dataset info logging
logging.info(f"Number of samples: {len(dataset)}")
logging.info(f"Feature columns: {dataset.available_features}")
logging.info(f"Active feature levels: {dataset.active_feature_levels}")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[3], line 8 5 logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") 7 # Load dataset ----> 8 dataset = CuratedGlassTempDataset( 9 feature_levels=["sidechainlevel", "backbonelevel", "fullpolymerlevel"] 10 ) 12 # Extract features and labels 13 X = dataset.get_features(idx=np.arange(len(dataset))) NameError: name 'CuratedGlassTempDataset' is not defined
Evaluation and modeling functions¶
In [4]:
Copied!
def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Fit a gradient-boosting regressor and score it on held-out data.

    A new ``GradientBoostingRegressor`` seeded with ``RANDOM_STATE`` is
    created and fitted on every call; nothing is cached between calls.

    Args:
        X_train: Training features, passed to ``model.fit``.
        X_test: Evaluation features, passed to ``model.predict``.
        y_train: Training targets.
        y_test: Targets the predictions are compared against.

    Returns:
        The mean absolute error between ``y_test`` and the predictions.
    """
    model = GradientBoostingRegressor(random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds)


def log_splits(X_train, X_valid, X_test):
    """Log the number of samples in each split at INFO level.

    Args:
        X_train: Training split; only its ``len`` is used.
        X_valid: Validation split, or ``None`` when there is none
            (reported as 0 samples).
        X_test: Test split; only its ``len`` is used.
    """
    n_valid = len(X_valid) if X_valid is not None else 0
    logging.info(f"Training set: {len(X_train)} samples")
    logging.info(f"Validation set: {n_valid} samples")
    logging.info(f"Test set: {len(X_test)} samples")
1: Random Split¶
Traditional train/valid/test split
In [5]:
Copied!
# Random split: hold out 30% of the data, then halve that hold-out into
# validation and test sets (70/15/15 overall).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE
)

log_splits(X_train, X_valid, X_test)

# Evaluation: both scores come from models fitted on the same training split
valid_mae = train_and_evaluate(X_train, X_valid, y_train, y_valid)
test_mae = train_and_evaluate(X_train, X_test, y_train, y_test)
logging.info(f"Validation MAE: {valid_mae:.2f}, Test MAE: {test_mae:.2f}")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[5], line 2 1 # Random split ----> 2 X_train, X_temp, y_train, y_temp = train_test_split( 3 X, y, test_size=0.3, random_state=RANDOM_STATE 4 ) 5 X_valid, X_test, y_valid, y_test = train_test_split( 6 X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE 7 ) 9 log_splits(X_train, X_valid, X_test) NameError: name 'train_test_split' is not defined
Random k-fold split
In [6]:
Copied!
# Shuffled 5-fold cross-validation over all samples
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
cv_scores = []

# Fold numbering starts at 1 so the log lines read "Fold 1" .. "Fold 5"
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    fold_mae = train_and_evaluate(X[train_idx], X[test_idx], y[train_idx], y[test_idx])
    cv_scores.append(fold_mae)
    logging.info(f"Fold {fold} MAE: {fold_mae:.2f}")

# Summarise as mean +/- standard deviation of the per-fold MAEs
logging.info(f"CV MAE: {np.mean(cv_scores):.2f} ± {np.std(cv_scores):.2f}")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[6], line 1 ----> 1 kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE) 2 cv_scores = [] 4 for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1): NameError: name 'KFold' is not defined
2: Leave-cluster-out cross-validation¶
In [7]:
Copied!
# Leave-cluster-out splitter over all available feature columns.
# NOTE(review): presumably the features are scaled and PCA-reduced to
# `n_pca_components` dimensions before clustering -- confirm against the
# LOCOCV documentation.
loco = LOCOCV(
    ds=dataset,
    feature_names=dataset.available_features,
    n_pca_components=3,
    random_state=RANDOM_STATE,
    scaled=True,
)

# Single split
train_idx, valid_idx, test_idx = loco.train_valid_test_split()
log_splits(X[train_idx], X[valid_idx], X[test_idx])

# Evaluation: both scores come from models fitted on the same training indices
valid_mae = train_and_evaluate(X[train_idx], X[valid_idx], y[train_idx], y[valid_idx])
test_mae = train_and_evaluate(X[train_idx], X[test_idx], y[train_idx], y[test_idx])
logging.info(f"LOCOCV MAE: Valid {valid_mae:.2f}, Test {test_mae:.2f}")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[7], line 1 ----> 1 loco = LOCOCV( 2 ds=dataset, 3 feature_names=dataset.available_features, 4 n_pca_components=3, 5 random_state=RANDOM_STATE, 6 scaled=True 7 ) 9 # Single split 10 train_idx, valid_idx, test_idx = loco.train_valid_test_split() NameError: name 'LOCOCV' is not defined
2: Leave-cluster-out cross-validation (k-fold)
K-fold split based on clusters
In [8]:
Copied!
# LOCOCV 5-Fold
# NOTE(review): the number of folds is requested via k_fold(k=5) below;
# `n_pca_components` looks like the PCA dimensionality rather than the fold
# count, so the two values need not match -- confirm against the LOCOCV docs.
loco_cv = LOCOCV(
    ds=dataset,
    feature_names=dataset.available_features,
    n_pca_components=5,
    random_state=RANDOM_STATE,
    scaled=True,
)

cv_scores = []
# Fold numbering starts at 1 for readable log lines
for fold, (train_idx, test_idx) in enumerate(loco_cv.k_fold(k=5), 1):
    fold_mae = train_and_evaluate(X[train_idx], X[test_idx], y[train_idx], y[test_idx])
    cv_scores.append(fold_mae)
    logging.info(f"LOCOCV Fold {fold} MAE: {fold_mae:.2f}")

# Summarise as mean +/- standard deviation of the per-fold MAEs
logging.info(f"LOCOCV 5-Fold MAE: {np.mean(cv_scores):.2f} ± {np.std(cv_scores):.2f}")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[8], line 2 1 # LOCOCV 5-Fold ----> 2 loco_cv = LOCOCV( 3 ds=dataset, 4 feature_names=dataset.available_features, 5 n_pca_components=5, # For 5-fold CV 6 random_state=RANDOM_STATE, 7 scaled=True 8 ) 10 cv_scores = [] 11 for fold, (train_idx, test_idx) in enumerate(loco_cv.k_fold(k=5), 1): NameError: name 'LOCOCV' is not defined
3: TgSplitter
In [9]:
Copied!
# Splitter driven by the Tg labels; `tg_q` holds the quantile edges
# 0, 0.25, 0.5, 0.75, 1 (presumably used to bin samples into Tg groups --
# confirm against the TgSplitter documentation).
tg_splitter = TgSplitter(
    ds=dataset,
    tg_q=np.linspace(0, 1, 5),
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Single split: 70% train, 10% validation; the remainder is presumably the
# test set -- confirm.
train_idx, valid_idx, test_idx = tg_splitter.train_valid_test_split(
    frac_train=0.7,
    frac_valid=0.1,
)
log_splits(X[train_idx], X[valid_idx], X[test_idx])

# Evaluation: both scores come from models fitted on the same training indices
valid_mae = train_and_evaluate(X[train_idx], X[valid_idx], y[train_idx], y[valid_idx])
test_mae = train_and_evaluate(X[train_idx], X[test_idx], y[train_idx], y[test_idx])
logging.info(f"TgSplitter MAE: Valid {valid_mae:.2f}, Test {test_mae:.2f}")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[9], line 1 ----> 1 tg_splitter = TgSplitter( 2 ds=dataset, 3 tg_q=np.linspace(0, 1, 5), 4 shuffle=True, 5 random_state=RANDOM_STATE 6 ) 8 # Single split 9 train_idx, valid_idx, test_idx = tg_splitter.train_valid_test_split( 10 frac_train=0.7, 11 frac_valid=0.1 12 ) NameError: name 'TgSplitter' is not defined
3: TgSplitter (k-fold)
K-fold split based on Tg groups
In [10]:
Copied!
# TgSplitter Grouped K-Fold
tg_splitter_cv = TgSplitter(
    ds=dataset,
    tg_q=np.linspace(0, 1, 6),  # 5 groups for 5-fold
    shuffle=True,
    random_state=RANDOM_STATE,
)

# NOTE(review): `_get_groups` is a private method of the splitter; it is used
# here to obtain one group label per sample. Prefer a public k-fold API if the
# splitter exposes one.
groups = tg_splitter_cv._get_groups()
unique_groups = np.unique(groups)

cv_scores = []
# Leave-one-group-out: each distinct group serves as the test set exactly once
for fold, test_group in enumerate(unique_groups, 1):
    test_mask = groups == test_group
    train_mask = ~test_mask
    fold_mae = train_and_evaluate(X[train_mask], X[test_mask], y[train_mask], y[test_mask])
    cv_scores.append(fold_mae)
    logging.info(f"TgSplitter Fold {fold} MAE: {fold_mae:.2f}")

# Summarise as mean +/- standard deviation of the per-fold MAEs
logging.info(f"TgSplitter 5-Fold MAE: {np.mean(cv_scores):.2f} ± {np.std(cv_scores):.2f}")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[10], line 2 1 # TgSplitter Grouped K-Fold ----> 2 tg_splitter_cv = TgSplitter( 3 ds=dataset, 4 tg_q=np.linspace(0, 1, 6), # 5 groups for 5-fold 5 shuffle=True, 6 random_state=RANDOM_STATE 7 ) 9 groups = tg_splitter_cv._get_groups() 10 unique_groups = np.unique(groups) NameError: name 'TgSplitter' is not defined