Skip to content

API Reference

datasets

AbstractDataset

Bases: ABC

Base class for polymer datasets.

Source code in src/polymetrix/datasets/dataset.py
class AbstractDataset(ABC):
    """Base class for polymer datasets.

    Subclasses implement ``_load_data`` and fill the private ``_features``,
    ``_labels``, ``_meta_data`` and ``_psmiles`` containers together with the
    matching ``_*_names`` lists. Column ``j`` of a container is addressed by
    the name stored at position ``j`` of the corresponding name list.
    """

    def __init__(self):
        """Initialize an empty dataset."""
        self._meta_data = None
        self._features = None
        self._labels = None
        self._psmiles = None
        self._feature_names = []
        self._label_names = []
        self._meta_names = []

    @abstractmethod
    def _load_data(self, subset: Optional[Collection[int]] = None):
        """Load and prepare the dataset-specific data.

        Args:
            subset (Optional[Collection[int]]): Indices to include in the dataset.
        """
        pass

    @staticmethod
    def _as_row_index(idx: Collection[int]) -> np.ndarray:
        """Convert a collection of row indices into an array usable for indexing.

        Args:
            idx (Collection[int]): Indices of entries.

        Returns:
            np.ndarray: Index array built from ``idx``.
        """
        rows = np.asarray(idx)
        if rows.size == 0:
            # An empty selection becomes a float array, which NumPy rejects
            # as an index; cast it so "no rows" yields an empty result.
            rows = rows.astype(int)
        return rows

    def get_subset(self, indices: Collection[int]) -> "AbstractDataset":
        """Get a subset of the dataset.

        Args:
            indices (Collection[int]): Row positions (relative to this dataset)
                to keep.

        Returns:
            AbstractDataset: New instance of the same class holding the
            selected rows and copies of the name lists.

        Raises:
            IndexError: If any index is negative or not smaller than ``len(self)``.
        """
        if not all(0 <= i < len(self) for i in indices):
            raise IndexError("Indices out of bounds.")
        # Index with an array: a raw tuple would be read as one index per axis.
        rows = self._as_row_index(indices)
        subset = self.__class__()
        subset._features = self._features[rows]
        subset._labels = self._labels[rows]
        subset._meta_data = self._meta_data[rows]
        subset._psmiles = self._psmiles[rows] if self._psmiles is not None else None
        subset._feature_names = self._feature_names.copy()
        subset._label_names = self._label_names.copy()
        subset._meta_names = self._meta_names.copy()
        return subset

    @property
    def available_features(self) -> list[str]:
        """List of available features.

        Returns:
            list[str]: List of feature names
        """
        return self._feature_names

    @property
    def available_labels(self) -> list[str]:
        """List of available labels.

        Returns:
            list[str]: List of label names
        """
        return self._label_names

    @property
    def meta_info(self) -> list[str]:
        """List of available metadata fields.

        Returns:
            list[str]: List of metadata field names
        """
        return self._meta_names

    @property
    def psmiles(self) -> np.ndarray:
        """Return the polymer SMILES strings.

        Returns:
            np.ndarray: Array of polymer SMILES strings
        """
        return self._psmiles

    def __len__(self):
        """Return the number of entries in the dataset (0 if nothing is loaded)."""
        return len(self._features) if self._features is not None else 0

    def __iter__(self):
        """Iterate over the features in the dataset (nothing if none are loaded)."""
        # Mirror __len__: an unloaded dataset behaves as empty instead of raising.
        return iter(self._features if self._features is not None else ())

    def get_features(
        self, idx: Collection[int], feature_names: Optional[Collection[str]] = None
    ) -> np.ndarray:
        """Get features for specified indices.

        Args:
            idx (Collection[int]): Indices of entries.
            feature_names (Optional[Collection[str]]): Names of features to return.
                If None, returns all available features.

        Returns:
            np.ndarray: Array of feature values.

        Raises:
            ValueError: If a requested name is not an available feature.
        """
        rows = self._as_row_index(idx)
        if feature_names is None:
            return self._features[rows]
        col_indices = [self._feature_names.index(name) for name in feature_names]
        return self._features[rows][:, col_indices]

    def get_labels(
        self, idx: Collection[int], label_names: Optional[Collection[str]] = None
    ) -> np.ndarray:
        """Get labels for specified indices.

        Args:
            idx (Collection[int]): Indices of entries.
            label_names (Optional[Collection[str]]): Names of labels to return.
                If None, returns all available labels.

        Returns:
            np.ndarray: Array of label values.

        Raises:
            ValueError: If a requested name is not an available label.
        """
        rows = self._as_row_index(idx)
        if label_names is None:
            return self._labels[rows]
        col_indices = [self._label_names.index(name) for name in label_names]
        return self._labels[rows][:, col_indices]

    def get_meta(
        self, idx: Collection[int], meta_keys: Optional[Collection[str]] = None
    ) -> np.ndarray:
        """Get metadata for specified indices.

        Args:
            idx (Collection[int]): Indices of entries.
            meta_keys (Optional[Collection[str]]): Names of metadata fields to return.
                If None, returns all available metadata.

        Returns:
            np.ndarray: Array of metadata values.

        Raises:
            ValueError: If a requested key is not an available metadata field.
        """
        rows = self._as_row_index(idx)
        if meta_keys is None:
            return self._meta_data[rows]
        col_indices = [self._meta_names.index(name) for name in meta_keys]
        return self._meta_data[rows][:, col_indices]

available_features property

List of available features. Returns: list[str]: List of feature names

available_labels property

List of available labels. Returns: list[str]: List of label names

meta_info property

List of available metadata fields. Returns: list[str]: List of metadata field names

psmiles property

Return the polymer SMILES strings. Returns: np.ndarray: Array of polymer SMILES strings

__init__()

Initialize a dataset.

Source code in src/polymetrix/datasets/dataset.py
def __init__(self):
    """Create an empty dataset: no arrays loaded and no column names known."""
    # Data containers stay None until a loader populates them.
    self._meta_data = self._features = self._labels = self._psmiles = None
    # Each name list is a distinct object so later mutation never cross-talks.
    self._feature_names, self._label_names, self._meta_names = [], [], []

__iter__()

Iterate over the features in the dataset.

Source code in src/polymetrix/datasets/dataset.py
def __iter__(self):
    """Iterate over the features in the dataset (nothing if none are loaded)."""
    # Mirror __len__: an unloaded dataset behaves as empty instead of raising.
    return iter(self._features if self._features is not None else ())

__len__()

Return the number of entries in the dataset.

Source code in src/polymetrix/datasets/dataset.py
def __len__(self):
    """Number of entries, taken from the feature container; 0 when it is unset."""
    if self._features is None:
        return 0
    return len(self._features)

get_features(idx, feature_names=None)

Get features for specified indices.

Args: idx (Collection[int]): Indices of entries. feature_names (Optional[Collection[str]]): Names of features to return; if None, all available features are returned.

Returns: np.ndarray: Array of feature values.

Source code in src/polymetrix/datasets/dataset.py
def get_features(
    self, idx: Collection[int], feature_names: Optional[Collection[str]] = None
) -> np.ndarray:
    """Get features for specified indices.

    Args:
        idx (Collection[int]): Indices of entries.
        feature_names (Optional[Collection[str]]): Names of features to return.
            If None, returns all available features.

    Returns:
        np.ndarray: Array of feature values.

    Raises:
        ValueError: If a requested name is not an available feature.
    """
    rows = np.asarray(idx)
    if rows.size == 0:
        # An empty selection becomes a float array, which NumPy rejects as an index.
        rows = rows.astype(int)
    if feature_names is None:
        return self._features[rows]
    col_indices = [self._feature_names.index(name) for name in feature_names]
    return self._features[rows][:, col_indices]

get_labels(idx, label_names=None)

Get labels for specified indices.

Args: idx (Collection[int]): Indices of entries. label_names (Optional[Collection[str]]): Names of labels to return; if None, all available labels are returned.

Returns: np.ndarray: Array of label values.

Source code in src/polymetrix/datasets/dataset.py
def get_labels(
    self, idx: Collection[int], label_names: Optional[Collection[str]] = None
) -> np.ndarray:
    """Get labels for specified indices.

    Args:
        idx (Collection[int]): Indices of entries.
        label_names (Optional[Collection[str]]): Names of labels to return.
            If None, returns all available labels.

    Returns:
        np.ndarray: Array of label values.

    Raises:
        ValueError: If a requested name is not an available label.
    """
    rows = np.asarray(idx)
    if rows.size == 0:
        # An empty selection becomes a float array, which NumPy rejects as an index.
        rows = rows.astype(int)
    if label_names is None:
        return self._labels[rows]
    col_indices = [self._label_names.index(name) for name in label_names]
    return self._labels[rows][:, col_indices]

get_meta(idx, meta_keys=None)

Get metadata for specified indices. Args: idx (Collection[int]): Indices of entries. meta_keys (Optional[Collection[str]]): Names of metadata fields to return. If None, returns all available metadata.

Returns:

Type Description
ndarray

np.ndarray: Array of metadata values.

Source code in src/polymetrix/datasets/dataset.py
def get_meta(
    self, idx: Collection[int], meta_keys: Optional[Collection[str]] = None
) -> np.ndarray:
    """Get metadata for specified indices.

    Args:
        idx (Collection[int]): Indices of entries.
        meta_keys (Optional[Collection[str]]): Names of metadata fields to return.
            If None, returns all available metadata.

    Returns:
        np.ndarray: Array of metadata values.

    Raises:
        ValueError: If a requested key is not an available metadata field.
    """
    rows = np.asarray(idx)
    if rows.size == 0:
        # An empty selection becomes a float array, which NumPy rejects as an index.
        rows = rows.astype(int)
    if meta_keys is None:
        return self._meta_data[rows]
    col_indices = [self._meta_names.index(name) for name in meta_keys]
    return self._meta_data[rows][:, col_indices]

get_subset(indices)

Get a subset of the dataset.

Source code in src/polymetrix/datasets/dataset.py
def get_subset(self, indices: Collection[int]) -> "AbstractDataset":
    """Get a subset of the dataset.

    Args:
        indices (Collection[int]): Row positions (relative to this dataset)
            to keep.

    Returns:
        AbstractDataset: New instance of the same class holding the selected
        rows and copies of the name lists.

    Raises:
        IndexError: If any index is negative or not smaller than ``len(self)``.
    """
    if not all(0 <= i < len(self) for i in indices):
        raise IndexError("Indices out of bounds.")
    # Index with an array: a raw tuple would be read as one index per axis.
    rows = np.asarray(indices)
    if rows.size == 0:
        # An empty selection becomes a float array, which NumPy rejects as an index.
        rows = rows.astype(int)
    subset = self.__class__()
    subset._features = self._features[rows]
    subset._labels = self._labels[rows]
    subset._meta_data = self._meta_data[rows]
    subset._psmiles = self._psmiles[rows] if self._psmiles is not None else None
    subset._feature_names = self._feature_names.copy()
    subset._label_names = self._label_names.copy()
    subset._meta_names = self._meta_names.copy()
    return subset

CuratedGlassTempDataset

Bases: AbstractDataset

Dataset for polymer glass transition temperature (Tg) data.

Source code in src/polymetrix/datasets/curated_tg_dataset.py
class CuratedGlassTempDataset(AbstractDataset):
    """Dataset for polymer glass transition temperature (Tg) data.

    The CSV is obtained through ``POLYMETRIX_PYSTOW_MODULE.ensure`` from
    ``DEFAULT_URL``. Columns are split into features, labels and metadata by
    their name prefixes.
    """

    ALL_FEATURE_LEVELS = [
        "sidechainlevel",
        "backbonelevel",
        "fullpolymerlevel",
    ]
    FEATURE_PREFIX = "features."
    LABEL_PREFIX = "labels."
    META_PREFIX = "meta."

    DEFAULT_VERSION = "v1"
    DEFAULT_URL = "https://zenodo.org/records/15210035/files/LAMALAB_CURATED_Tg_structured_polymerclass.csv?download=1"

    def __init__(
        self,
        feature_levels: List[str] = ALL_FEATURE_LEVELS,
        subset: Optional[Collection[int]] = None,
    ):
        """Initialize the Tg dataset.

        Args:
            feature_levels (List[str]): Feature levels to include
            subset (Optional[Collection[int]]): Indices to include in the dataset

        Raises:
            ValueError: If ``feature_levels`` contains a level that is not in
                ``ALL_FEATURE_LEVELS``.
        """
        super().__init__()
        self._version = self.DEFAULT_VERSION
        self._url = self.DEFAULT_URL
        # Copy: the default is the class-level list, and storing it directly
        # would let callers mutate ALL_FEATURE_LEVELS via active_feature_levels.
        self._feature_levels = list(feature_levels)

        # Validate feature levels using set operations
        if not set(self._feature_levels).issubset(self.ALL_FEATURE_LEVELS):
            raise ValueError(
                f"feature_levels must be a subset of {self.ALL_FEATURE_LEVELS}, "
                f"got {self._feature_levels}"
            )

        self._load_data(subset)

    def _load_data(self, subset: Optional[Collection[int]] = None):
        """Load and prepare the dataset.

        Args:
            subset (Optional[Collection[int]]): Row positions of the full CSV
                to keep. If None, all rows are kept.
        """
        csv_path = POLYMETRIX_PYSTOW_MODULE.ensure(
            "CuratedGlassTempDataset",
            self._version,
            url=self._url,
        )
        frame = pd.read_csv(str(csv_path)).reset_index(drop=True)

        if subset is not None:
            if isinstance(subset, tuple):
                # ``iloc`` reads a tuple as (row, column); a list selects rows.
                subset = list(subset)
            frame = frame.iloc[subset]

        # Before the index is reset it still holds each row's position in the
        # full CSV; remember it so get_subset can translate local indices.
        self._source_rows = frame.index.tolist()
        self._df = frame.reset_index(drop=True)

        self._psmiles = self._df["PSMILES"].to_numpy()

        allowed_prefixes = [
            f"{level}.{self.FEATURE_PREFIX}" for level in self._feature_levels
        ]
        self._feature_names = self._filter_columns(allowed_prefixes)

        self._label_names = self._filter_columns([self.LABEL_PREFIX])
        self._meta_names = self._filter_columns([self.META_PREFIX])

        self._features = self._df[self._feature_names].to_numpy()
        self._labels = self._df[self._label_names].to_numpy()
        self._meta_data = self._df[self._meta_names].to_numpy()

    def _filter_columns(self, prefixes: List[str]) -> List[str]:
        """Return the columns of the loaded frame that start with any given prefix."""
        return [
            col
            for col in self._df.columns
            if any(col.startswith(prefix) for prefix in prefixes)
        ]

    @property
    def df(self) -> pd.DataFrame:
        """The loaded data frame (after any subsetting)."""
        return self._df

    @property
    def active_feature_levels(self) -> List[str]:
        """Feature levels this instance was created with."""
        return self._feature_levels

    def get_subset(self, indices: Collection[int]) -> "CuratedGlassTempDataset":
        """Get a subset of the dataset.

        Args:
            indices (Collection[int]): Row positions relative to this dataset.

        Returns:
            CuratedGlassTempDataset: New dataset with the same feature levels
            that contains only the selected rows.

        Raises:
            IndexError: If an index is outside this dataset.
        """
        # The constructor reloads the full CSV, so local positions must first be
        # mapped back to full-CSV positions; otherwise subsetting an already
        # subsetted dataset would pick unrelated rows.
        source_rows = [self._source_rows[i] for i in indices]
        return CuratedGlassTempDataset(
            feature_levels=self._feature_levels,
            subset=source_rows,
        )

__init__(feature_levels=ALL_FEATURE_LEVELS, subset=None)

Initialize the Tg dataset.

Args: feature_levels (List[str]): Feature levels to include. subset (Optional[Collection[int]]): Indices to include in the dataset.

Source code in src/polymetrix/datasets/curated_tg_dataset.py
def __init__(
    self,
    feature_levels: List[str] = ALL_FEATURE_LEVELS,
    subset: Optional[Collection[int]] = None,
):
    """Initialize the Tg dataset.

    Args:
        feature_levels (List[str]): Feature levels to include
        subset (Optional[Collection[int]]): Indices to include in the dataset

    Raises:
        ValueError: If ``feature_levels`` contains a level that is not in
            ``ALL_FEATURE_LEVELS``.
    """
    super().__init__()
    self._version = self.DEFAULT_VERSION
    self._url = self.DEFAULT_URL
    # Copy: the default is the class-level list, and storing it directly
    # would let callers mutate ALL_FEATURE_LEVELS through this instance.
    self._feature_levels = list(feature_levels)

    # Validate feature levels using set operations
    if not set(self._feature_levels).issubset(self.ALL_FEATURE_LEVELS):
        raise ValueError(
            f"feature_levels must be a subset of {self.ALL_FEATURE_LEVELS}, "
            f"got {self._feature_levels}"
        )

    self._load_data(subset)

curated_tg_dataset

CuratedGlassTempDataset

Bases: AbstractDataset

Dataset for polymer glass transition temperature (Tg) data.

Source code in src/polymetrix/datasets/curated_tg_dataset.py
class CuratedGlassTempDataset(AbstractDataset):
    """Dataset for polymer glass transition temperature (Tg) data.

    The CSV is obtained through ``POLYMETRIX_PYSTOW_MODULE.ensure`` from
    ``DEFAULT_URL``. Columns are split into features, labels and metadata by
    their name prefixes.
    """

    ALL_FEATURE_LEVELS = [
        "sidechainlevel",
        "backbonelevel",
        "fullpolymerlevel",
    ]
    FEATURE_PREFIX = "features."
    LABEL_PREFIX = "labels."
    META_PREFIX = "meta."

    DEFAULT_VERSION = "v1"
    DEFAULT_URL = "https://zenodo.org/records/15210035/files/LAMALAB_CURATED_Tg_structured_polymerclass.csv?download=1"

    def __init__(
        self,
        feature_levels: List[str] = ALL_FEATURE_LEVELS,
        subset: Optional[Collection[int]] = None,
    ):
        """Initialize the Tg dataset.

        Args:
            feature_levels (List[str]): Feature levels to include
            subset (Optional[Collection[int]]): Indices to include in the dataset

        Raises:
            ValueError: If ``feature_levels`` contains a level that is not in
                ``ALL_FEATURE_LEVELS``.
        """
        super().__init__()
        self._version = self.DEFAULT_VERSION
        self._url = self.DEFAULT_URL
        # Copy: the default is the class-level list, and storing it directly
        # would let callers mutate ALL_FEATURE_LEVELS via active_feature_levels.
        self._feature_levels = list(feature_levels)

        # Validate feature levels using set operations
        if not set(self._feature_levels).issubset(self.ALL_FEATURE_LEVELS):
            raise ValueError(
                f"feature_levels must be a subset of {self.ALL_FEATURE_LEVELS}, "
                f"got {self._feature_levels}"
            )

        self._load_data(subset)

    def _load_data(self, subset: Optional[Collection[int]] = None):
        """Load and prepare the dataset.

        Args:
            subset (Optional[Collection[int]]): Row positions of the full CSV
                to keep. If None, all rows are kept.
        """
        csv_path = POLYMETRIX_PYSTOW_MODULE.ensure(
            "CuratedGlassTempDataset",
            self._version,
            url=self._url,
        )
        frame = pd.read_csv(str(csv_path)).reset_index(drop=True)

        if subset is not None:
            if isinstance(subset, tuple):
                # ``iloc`` reads a tuple as (row, column); a list selects rows.
                subset = list(subset)
            frame = frame.iloc[subset]

        # Before the index is reset it still holds each row's position in the
        # full CSV; remember it so get_subset can translate local indices.
        self._source_rows = frame.index.tolist()
        self._df = frame.reset_index(drop=True)

        self._psmiles = self._df["PSMILES"].to_numpy()

        allowed_prefixes = [
            f"{level}.{self.FEATURE_PREFIX}" for level in self._feature_levels
        ]
        self._feature_names = self._filter_columns(allowed_prefixes)

        self._label_names = self._filter_columns([self.LABEL_PREFIX])
        self._meta_names = self._filter_columns([self.META_PREFIX])

        self._features = self._df[self._feature_names].to_numpy()
        self._labels = self._df[self._label_names].to_numpy()
        self._meta_data = self._df[self._meta_names].to_numpy()

    def _filter_columns(self, prefixes: List[str]) -> List[str]:
        """Return the columns of the loaded frame that start with any given prefix."""
        return [
            col
            for col in self._df.columns
            if any(col.startswith(prefix) for prefix in prefixes)
        ]

    @property
    def df(self) -> pd.DataFrame:
        """The loaded data frame (after any subsetting)."""
        return self._df

    @property
    def active_feature_levels(self) -> List[str]:
        """Feature levels this instance was created with."""
        return self._feature_levels

    def get_subset(self, indices: Collection[int]) -> "CuratedGlassTempDataset":
        """Get a subset of the dataset.

        Args:
            indices (Collection[int]): Row positions relative to this dataset.

        Returns:
            CuratedGlassTempDataset: New dataset with the same feature levels
            that contains only the selected rows.

        Raises:
            IndexError: If an index is outside this dataset.
        """
        # The constructor reloads the full CSV, so local positions must first be
        # mapped back to full-CSV positions; otherwise subsetting an already
        # subsetted dataset would pick unrelated rows.
        source_rows = [self._source_rows[i] for i in indices]
        return CuratedGlassTempDataset(
            feature_levels=self._feature_levels,
            subset=source_rows,
        )
__init__(feature_levels=ALL_FEATURE_LEVELS, subset=None)

Initialize the Tg dataset. Args: feature_levels (List[str]): Feature levels to include subset (Optional[Collection[int]]): Indices to include in the dataset

Source code in src/polymetrix/datasets/curated_tg_dataset.py
def __init__(
    self,
    feature_levels: List[str] = ALL_FEATURE_LEVELS,
    subset: Optional[Collection[int]] = None,
):
    """Initialize the Tg dataset.

    Args:
        feature_levels (List[str]): Feature levels to include
        subset (Optional[Collection[int]]): Indices to include in the dataset

    Raises:
        ValueError: If ``feature_levels`` contains a level that is not in
            ``ALL_FEATURE_LEVELS``.
    """
    super().__init__()
    self._version = self.DEFAULT_VERSION
    self._url = self.DEFAULT_URL
    # Copy: the default is the class-level list, and storing it directly
    # would let callers mutate ALL_FEATURE_LEVELS through this instance.
    self._feature_levels = list(feature_levels)

    # Validate feature levels using set operations
    if not set(self._feature_levels).issubset(self.ALL_FEATURE_LEVELS):
        raise ValueError(
            f"feature_levels must be a subset of {self.ALL_FEATURE_LEVELS}, "
            f"got {self._feature_levels}"
        )

    self._load_data(subset)

dataset

AbstractDataset

Bases: ABC

Base class for polymer datasets.

Source code in src/polymetrix/datasets/dataset.py
class AbstractDataset(ABC):
    """Base class for polymer datasets.

    Subclasses implement ``_load_data`` and fill the private ``_features``,
    ``_labels``, ``_meta_data`` and ``_psmiles`` containers together with the
    matching ``_*_names`` lists. Column ``j`` of a container is addressed by
    the name stored at position ``j`` of the corresponding name list.
    """

    def __init__(self):
        """Initialize an empty dataset."""
        self._meta_data = None
        self._features = None
        self._labels = None
        self._psmiles = None
        self._feature_names = []
        self._label_names = []
        self._meta_names = []

    @abstractmethod
    def _load_data(self, subset: Optional[Collection[int]] = None):
        """Load and prepare the dataset-specific data.

        Args:
            subset (Optional[Collection[int]]): Indices to include in the dataset.
        """
        pass

    @staticmethod
    def _as_row_index(idx: Collection[int]) -> np.ndarray:
        """Convert a collection of row indices into an array usable for indexing.

        Args:
            idx (Collection[int]): Indices of entries.

        Returns:
            np.ndarray: Index array built from ``idx``.
        """
        rows = np.asarray(idx)
        if rows.size == 0:
            # An empty selection becomes a float array, which NumPy rejects
            # as an index; cast it so "no rows" yields an empty result.
            rows = rows.astype(int)
        return rows

    def get_subset(self, indices: Collection[int]) -> "AbstractDataset":
        """Get a subset of the dataset.

        Args:
            indices (Collection[int]): Row positions (relative to this dataset)
                to keep.

        Returns:
            AbstractDataset: New instance of the same class holding the
            selected rows and copies of the name lists.

        Raises:
            IndexError: If any index is negative or not smaller than ``len(self)``.
        """
        if not all(0 <= i < len(self) for i in indices):
            raise IndexError("Indices out of bounds.")
        # Index with an array: a raw tuple would be read as one index per axis.
        rows = self._as_row_index(indices)
        subset = self.__class__()
        subset._features = self._features[rows]
        subset._labels = self._labels[rows]
        subset._meta_data = self._meta_data[rows]
        subset._psmiles = self._psmiles[rows] if self._psmiles is not None else None
        subset._feature_names = self._feature_names.copy()
        subset._label_names = self._label_names.copy()
        subset._meta_names = self._meta_names.copy()
        return subset

    @property
    def available_features(self) -> list[str]:
        """List of available features.

        Returns:
            list[str]: List of feature names
        """
        return self._feature_names

    @property
    def available_labels(self) -> list[str]:
        """List of available labels.

        Returns:
            list[str]: List of label names
        """
        return self._label_names

    @property
    def meta_info(self) -> list[str]:
        """List of available metadata fields.

        Returns:
            list[str]: List of metadata field names
        """
        return self._meta_names

    @property
    def psmiles(self) -> np.ndarray:
        """Return the polymer SMILES strings.

        Returns:
            np.ndarray: Array of polymer SMILES strings
        """
        return self._psmiles

    def __len__(self):
        """Return the number of entries in the dataset (0 if nothing is loaded)."""
        return len(self._features) if self._features is not None else 0

    def __iter__(self):
        """Iterate over the features in the dataset (nothing if none are loaded)."""
        # Mirror __len__: an unloaded dataset behaves as empty instead of raising.
        return iter(self._features if self._features is not None else ())

    def get_features(
        self, idx: Collection[int], feature_names: Optional[Collection[str]] = None
    ) -> np.ndarray:
        """Get features for specified indices.

        Args:
            idx (Collection[int]): Indices of entries.
            feature_names (Optional[Collection[str]]): Names of features to return.
                If None, returns all available features.

        Returns:
            np.ndarray: Array of feature values.

        Raises:
            ValueError: If a requested name is not an available feature.
        """
        rows = self._as_row_index(idx)
        if feature_names is None:
            return self._features[rows]
        col_indices = [self._feature_names.index(name) for name in feature_names]
        return self._features[rows][:, col_indices]

    def get_labels(
        self, idx: Collection[int], label_names: Optional[Collection[str]] = None
    ) -> np.ndarray:
        """Get labels for specified indices.

        Args:
            idx (Collection[int]): Indices of entries.
            label_names (Optional[Collection[str]]): Names of labels to return.
                If None, returns all available labels.

        Returns:
            np.ndarray: Array of label values.

        Raises:
            ValueError: If a requested name is not an available label.
        """
        rows = self._as_row_index(idx)
        if label_names is None:
            return self._labels[rows]
        col_indices = [self._label_names.index(name) for name in label_names]
        return self._labels[rows][:, col_indices]

    def get_meta(
        self, idx: Collection[int], meta_keys: Optional[Collection[str]] = None
    ) -> np.ndarray:
        """Get metadata for specified indices.

        Args:
            idx (Collection[int]): Indices of entries.
            meta_keys (Optional[Collection[str]]): Names of metadata fields to return.
                If None, returns all available metadata.

        Returns:
            np.ndarray: Array of metadata values.

        Raises:
            ValueError: If a requested key is not an available metadata field.
        """
        rows = self._as_row_index(idx)
        if meta_keys is None:
            return self._meta_data[rows]
        col_indices = [self._meta_names.index(name) for name in meta_keys]
        return self._meta_data[rows][:, col_indices]
available_features property

List of available features. Returns: list[str]: List of feature names

available_labels property

List of available labels. Returns: list[str]: List of label names

meta_info property

List of available metadata fields. Returns: list[str]: List of metadata field names

psmiles property

Return the polymer SMILES strings. Returns: np.ndarray: Array of polymer SMILES strings

__init__()

Initialize a dataset.

Source code in src/polymetrix/datasets/dataset.py
def __init__(self):
    """Create an empty dataset: no arrays loaded and no column names known."""
    # Data containers stay None until a loader populates them.
    self._meta_data = self._features = self._labels = self._psmiles = None
    # Each name list is a distinct object so later mutation never cross-talks.
    self._feature_names, self._label_names, self._meta_names = [], [], []
__iter__()

Iterate over the features in the dataset.

Source code in src/polymetrix/datasets/dataset.py
def __iter__(self):
    """Iterate over the features in the dataset (nothing if none are loaded)."""
    # Mirror __len__: an unloaded dataset behaves as empty instead of raising.
    return iter(self._features if self._features is not None else ())
__len__()

Return the number of entries in the dataset.

Source code in src/polymetrix/datasets/dataset.py
def __len__(self):
    """Number of entries, taken from the feature container; 0 when it is unset."""
    if self._features is None:
        return 0
    return len(self._features)
get_features(idx, feature_names=None)

Get features for specified indices. Args: idx (Collection[int]): Indices of entries. feature_names (Optional[Collection[str]]): Names of features to return. If None, returns all available features. Returns: np.ndarray: Array of feature values.

Source code in src/polymetrix/datasets/dataset.py
def get_features(
    self, idx: Collection[int], feature_names: Optional[Collection[str]] = None
) -> np.ndarray:
    """Get features for specified indices.

    Args:
        idx (Collection[int]): Indices of entries.
        feature_names (Optional[Collection[str]]): Names of features to return.
            If None, returns all available features.

    Returns:
        np.ndarray: Array of feature values.

    Raises:
        ValueError: If a requested name is not an available feature.
    """
    rows = np.asarray(idx)
    if rows.size == 0:
        # An empty selection becomes a float array, which NumPy rejects as an index.
        rows = rows.astype(int)
    if feature_names is None:
        return self._features[rows]
    col_indices = [self._feature_names.index(name) for name in feature_names]
    return self._features[rows][:, col_indices]
get_labels(idx, label_names=None)

Get labels for specified indices. Args: idx (Collection[int]): Indices of entries. label_names (Optional[Collection[str]]): Names of labels to return. If None, returns all available labels. Returns: np.ndarray: Array of label values.

Source code in src/polymetrix/datasets/dataset.py
def get_labels(
    self, idx: Collection[int], label_names: Optional[Collection[str]] = None
) -> np.ndarray:
    """Get labels for specified indices.

    Args:
        idx (Collection[int]): Indices of entries.
        label_names (Optional[Collection[str]]): Names of labels to return.
            If None, returns all available labels.

    Returns:
        np.ndarray: Array of label values.

    Raises:
        ValueError: If a requested name is not an available label.
    """
    rows = np.asarray(idx)
    if rows.size == 0:
        # An empty selection becomes a float array, which NumPy rejects as an index.
        rows = rows.astype(int)
    if label_names is None:
        return self._labels[rows]
    col_indices = [self._label_names.index(name) for name in label_names]
    return self._labels[rows][:, col_indices]
get_meta(idx, meta_keys=None)

Get metadata for specified indices. Args: idx (Collection[int]): Indices of entries. meta_keys (Optional[Collection[str]]): Names of metadata fields to return. If None, returns all available metadata.

Returns:

Type Description
ndarray

np.ndarray: Array of metadata values.

Source code in src/polymetrix/datasets/dataset.py
def get_meta(
    self, idx: Collection[int], meta_keys: Optional[Collection[str]] = None
) -> np.ndarray:
    """Get metadata for specified indices.

    Args:
        idx (Collection[int]): Indices of entries.
        meta_keys (Optional[Collection[str]]): Names of metadata fields to return.
            If None, returns all available metadata.

    Returns:
        np.ndarray: Array of metadata values.

    Raises:
        ValueError: If a requested key is not an available metadata field.
    """
    rows = np.asarray(idx)
    if rows.size == 0:
        # An empty selection becomes a float array, which NumPy rejects as an index.
        rows = rows.astype(int)
    if meta_keys is None:
        return self._meta_data[rows]
    col_indices = [self._meta_names.index(name) for name in meta_keys]
    return self._meta_data[rows][:, col_indices]
get_subset(indices)

Get a subset of the dataset.

Source code in src/polymetrix/datasets/dataset.py
def get_subset(self, indices: Collection[int]) -> "AbstractDataset":
    """Get a subset of the dataset.

    Args:
        indices (Collection[int]): Row positions (relative to this dataset)
            to keep.

    Returns:
        AbstractDataset: New instance of the same class holding the selected
        rows and copies of the name lists.

    Raises:
        IndexError: If any index is negative or not smaller than ``len(self)``.
    """
    if not all(0 <= i < len(self) for i in indices):
        raise IndexError("Indices out of bounds.")
    # Index with an array: a raw tuple would be read as one index per axis.
    rows = np.asarray(indices)
    if rows.size == 0:
        # An empty selection becomes a float array, which NumPy rejects as an index.
        rows = rows.astype(int)
    subset = self.__class__()
    subset._features = self._features[rows]
    subset._labels = self._labels[rows]
    subset._meta_data = self._meta_data[rows]
    subset._psmiles = self._psmiles[rows] if self._psmiles is not None else None
    subset._feature_names = self._feature_names.copy()
    subset._label_names = self._label_names.copy()
    subset._meta_names = self._meta_names.copy()
    return subset

featurizers

base_featurizer

BaseFeatureCalculator

Source code in src/polymetrix/featurizers/base_featurizer.py
class BaseFeatureCalculator:
    """Common base for feature calculators.

    Holds the list of aggregation names (``agg``) and the table that maps each
    name to a NumPy reduction. Subclasses provide ``calculate`` and
    ``feature_base_labels``.
    """

    # Aggregation name -> NumPy reduction applied by ``aggregate``.
    agg_funcs = {"mean": np.mean, "min": np.min, "max": np.max, "sum": np.sum}

    def __init__(self, agg: List[str] = None):
        """Store the aggregation names; ``None`` selects ``["sum"]``."""
        self.agg = ["sum"] if agg is None else agg

    def _sanitize(self, mol: Chem.Mol, sanitize: bool) -> None:
        """Sanitize ``mol`` in place when ``sanitize`` is true.

        Every sanitization step except kekulization is requested. If an
        ``AtomKekulizeException`` is still raised, only the property cache of
        the molecule is updated.
        """
        if not sanitize:
            return
        ops = Chem.SANITIZE_ALL ^ Chem.SANITIZE_KEKULIZE
        try:
            Chem.SanitizeMol(mol, sanitizeOps=ops)
        except Chem.AtomKekulizeException:
            mol.UpdatePropertyCache()

    def calculate(self, mol: Chem.Mol) -> np.ndarray:
        """Compute the raw feature values for ``mol`` (subclass hook)."""
        raise NotImplementedError("Calculate method must be implemented by subclasses")

    def feature_base_labels(self) -> List[str]:
        """Return the un-aggregated feature names (subclass hook)."""
        raise NotImplementedError(
            "Feature labels method must be implemented by subclasses"
        )

    def feature_labels(self) -> List[str]:
        """Return ``"<base label>_<agg>"`` for every base label and aggregation.

        Labels are grouped by base label, with one entry per aggregation.
        """
        labels = []
        for base in self.feature_base_labels():
            for agg_name in self.agg:
                labels.append(f"{base}_{agg_name}")
        return labels

    def aggregate(self, features: List) -> np.ndarray:
        """Reduce a list of feature values with every aggregation in ``self.agg``.

        When the first element is a NumPy array, each reduction runs along
        axis 0 and the per-aggregation results are concatenated. Otherwise
        each reduction is applied to the whole list and the scalar results are
        collected into one array. An empty input gives an empty array.

        NOTE(review): for array inputs the output is grouped by aggregation,
        whereas ``feature_labels`` groups by base label; with several base
        labels and several aggregations the two orders differ. Confirm callers
        account for this.

        Raises:
            ValueError: If ``self.agg`` contains a name missing from ``agg_funcs``.
        """
        if not features:
            return np.array([])

        # The first element decides how the whole list is treated.
        array_valued = isinstance(features[0], np.ndarray)
        reduced = []
        for agg_name in self.agg:
            if agg_name not in self.agg_funcs:
                raise ValueError(f"Unknown aggregation function: {agg_name}")
            reducer = self.agg_funcs[agg_name]
            if array_valued:
                reduced.append(reducer(features, axis=0))
            else:
                reduced.append(reducer(features))
        return np.concatenate(reduced) if array_valued else np.array(reduced)

    def get_feature_names(self) -> List[str]:
        """Return the feature names (subclass hook)."""
        raise NotImplementedError(
            "Get feature name method must be implemented by subclasses"
        )

    def citations(self) -> List[str]:
        """Return the citations for this calculator; empty by default."""
        return []

    def implementors(self) -> List[str]:
        """Return the implementors of this calculator; empty by default."""
        return []
aggregate(features)

Aggregates a list of features using the aggregation functions specified in self.agg. If the features are numpy arrays, the aggregation is applied along the first axis. Otherwise, the aggregation is applied directly (assuming the features are scalar numeric values).

Source code in src/polymetrix/featurizers/base_featurizer.py
def aggregate(self, features: List) -> np.ndarray:
    """Reduce a list of feature values with every aggregation in ``self.agg``.

    When the first element is a NumPy array, each reduction runs along axis 0
    and the per-aggregation results are concatenated. Otherwise each reduction
    is applied to the whole list and the scalar results are collected into one
    array. An empty input gives an empty array.

    Raises:
        ValueError: If ``self.agg`` contains a name missing from ``self.agg_funcs``.
    """
    if not features:
        return np.array([])

    # The first element decides how the whole list is treated.
    array_valued = isinstance(features[0], np.ndarray)
    reduced = []
    for agg_name in self.agg:
        if agg_name not in self.agg_funcs:
            raise ValueError(f"Unknown aggregation function: {agg_name}")
        reducer = self.agg_funcs[agg_name]
        if array_valued:
            reduced.append(reducer(features, axis=0))
        else:
            reduced.append(reducer(features))
    return np.concatenate(reduced) if array_valued else np.array(reduced)

chemical_featurizer

BalabanJIndex

Bases: GenericScalarFeaturizer

Measures molecular complexity and connectivity of atoms.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class BalabanJIndex(GenericScalarFeaturizer):
    """
    Measures molecular complexity and connectivity of atoms.

    Thin subclass that passes ``GraphDescriptors.BalabanJ`` and the name
    ``"balaban_j_index"`` to ``GenericScalarFeaturizer``.
    """

    def __init__(self, agg: List[str] = None):
        # ``agg`` is forwarded unchanged; how ``None`` is resolved is decided
        # by the parent class, which is not shown here.
        super().__init__(GraphDescriptors.BalabanJ, "balaban_j_index", agg=agg)

BondCounts

Bases: BaseFeatureCalculator

Counts the number of single, double, and triple bonds in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class BondCounts(BaseFeatureCalculator):
    """Tallies single, double and triple bonds in a molecule.

    Bonds of any other RDKit bond type are ignored.
    """

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Count bonds per tracked bond type.

        Args:
            mol: Molecule whose bonds are inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.

        Returns:
            np.ndarray: ``[single, double, triple]`` bond counts.
        """
        self._sanitize(mol, sanitize)
        tracked = (
            Chem.BondType.SINGLE,
            Chem.BondType.DOUBLE,
            Chem.BondType.TRIPLE,
        )
        tally = dict.fromkeys(tracked, 0)
        for bond in mol.GetBonds():
            kind = bond.GetBondType()
            if kind in tally:
                tally[kind] += 1
        # Emit counts in the same order as ``feature_base_labels``.
        return np.array([tally[kind] for kind in tracked])

    def feature_base_labels(self) -> List[str]:
        """Return the label for each entry produced by ``calculate``."""
        return ["single_bonds", "double_bonds", "triple_bonds"]

BridgingRingsCount

Bases: BaseFeatureCalculator

Counts the number of bridging rings in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class BridgingRingsCount(BaseFeatureCalculator):
    """Counts rings that overlap a later ring in at least two atoms.

    Rings are taken from ``mol.GetRingInfo().AtomRings()``. A ring is counted
    once if it shares two or more atoms with any ring listed after it.

    NOTE(review): sharing exactly two atoms also matches ordinary fused
    rings, so this may count more than strictly bridged systems - confirm
    the intended definition.
    """

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Count rings sharing two or more atoms with a later ring.

        Args:
            mol: Molecule whose ring information is inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.

        Returns:
            np.ndarray: One-element array holding the count.
        """
        self._sanitize(mol, sanitize)
        ring_sets = [set(ring) for ring in mol.GetRingInfo().AtomRings()]
        overlapping = 0
        for idx, current in enumerate(ring_sets):
            later_rings = ring_sets[idx + 1 :]
            # Each ring contributes at most once, however many partners it has.
            if any(len(current & other) >= 2 for other in later_rings):
                overlapping += 1
        return np.array([overlapping])

    def feature_base_labels(self) -> List[str]:
        """Return the label for the single value produced by ``calculate``."""
        return ["bridging_rings_count"]

FpDensityMorgan1

Bases: GenericScalarFeaturizer

Calculates the density of the Morgan1 fingerprint.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class FpDensityMorgan1(GenericScalarFeaturizer):
    """Scalar featurizer built on RDKit's ``Descriptors.FpDensityMorgan1``.

    Hands that descriptor and the name ``"fp_density_morgan1"`` to
    ``GenericScalarFeaturizer``.
    """

    def __init__(self, agg: List[str] = None):
        """Set up the featurizer.

        Args:
            agg: Aggregation names, forwarded unchanged (including the
                default ``None``) to the base class.
        """
        super().__init__(
            Descriptors.FpDensityMorgan1,
            "fp_density_morgan1",
            agg=agg,
        )

FractionBicyclicRings

Bases: BaseFeatureCalculator

Calculates the fraction of bicyclic rings in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class FractionBicyclicRings(BaseFeatureCalculator):
    """Ratio of overlapping ring pairs to the number of rings.

    Every unordered pair of rings sharing at least one atom is counted, and
    the total is divided by the ring count. The value can therefore exceed 1
    when many rings overlap each other.
    """

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Compute the overlapping-pair ratio.

        Args:
            mol: Molecule whose ring information is inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.

        Returns:
            np.ndarray: One-element array with the ratio, or ``0`` when the
                molecule has no rings.
        """
        self._sanitize(mol, sanitize)
        rings = mol.GetRingInfo().AtomRings()
        if not rings:
            return np.array([0])

        ring_sets = [set(ring) for ring in rings]
        overlapping_pairs = 0
        for idx, first in enumerate(ring_sets):
            for second in ring_sets[idx + 1 :]:
                if first & second:
                    overlapping_pairs += 1
        return np.array([overlapping_pairs / len(rings)])

    def feature_base_labels(self) -> List[str]:
        """Return the label for the single value produced by ``calculate``."""
        return ["fraction_bicyclic_rings"]

HalogenCounts

Bases: BaseFeatureCalculator

Counts the number of halogen atoms in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class HalogenCounts(BaseFeatureCalculator):
    """Tallies F, Cl, Br and I atoms, plus their combined total."""

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Count halogen atoms by element.

        Args:
            mol: Molecule whose atoms are inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.

        Returns:
            np.ndarray: ``[total, F, Cl, Br, I]`` atom counts.
        """
        self._sanitize(mol, sanitize)
        halogen_numbers = (9, 17, 35, 53)  # F, Cl, Br, I
        per_element = dict.fromkeys(halogen_numbers, 0)
        for atom in mol.GetAtoms():
            number = atom.GetAtomicNum()
            if number in per_element:
                per_element[number] += 1

        ordered = [per_element[number] for number in halogen_numbers]
        return np.array([sum(ordered), *ordered])

    def feature_base_labels(self) -> List[str]:
        """Return the label for each entry produced by ``calculate``."""
        labels = ["total_halogens"]
        labels += [
            "fluorine_count",
            "chlorine_count",
            "bromine_count",
            "iodine_count",
        ]
        return labels

HeteroatomCount

Bases: BaseFeatureCalculator

Counts heteroatoms (non-C, non-H) in the whole molecule, not only those in heterocyclic rings.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class HeteroatomCount(BaseFeatureCalculator):
    """Counts heteroatoms (atoms that are neither carbon nor hydrogen).

    The count covers every atom of the molecule, whether or not it sits in
    a ring.

    NOTE(review): atoms with atomic number 0 (RDKit wildcard ``*`` atoms, as
    used for polymer connection points) are still counted - confirm whether
    that is intended.
    """

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Count atoms whose atomic number is neither 1 (H) nor 6 (C).

        Args:
            mol: Molecule whose atoms are inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.

        Returns:
            np.ndarray: One-element array holding the heteroatom count.
        """
        self._sanitize(mol, sanitize)
        # Explicit hydrogens must not be mistaken for heteroatoms.
        heteroatoms = sum(
            1 for atom in mol.GetAtoms() if atom.GetAtomicNum() not in (1, 6)
        )
        return np.array([heteroatoms])

    def feature_base_labels(self) -> List[str]:
        """Return the label for the single value produced by ``calculate``."""
        return ["heteroatom_count"]

HeteroatomDensity

Bases: BaseFeatureCalculator

Density of heteroatoms in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class HeteroatomDensity(BaseFeatureCalculator):
    """Fraction of the molecule's atoms that are heteroatoms.

    A heteroatom is any atom that is neither carbon nor hydrogen. The
    denominator is ``mol.GetNumAtoms()``.

    NOTE(review): atoms with atomic number 0 (RDKit wildcard ``*`` atoms)
    are counted as heteroatoms - confirm whether that is intended.
    """

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Compute heteroatom count divided by atom count.

        Args:
            mol: Molecule whose atoms are inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.

        Returns:
            np.ndarray: One-element array with the ratio, or ``0`` when the
                molecule has no atoms.
        """
        self._sanitize(mol, sanitize)
        num_atoms = mol.GetNumAtoms()
        # Explicit hydrogens must not be mistaken for heteroatoms.
        num_heteroatoms = sum(
            1 for atom in mol.GetAtoms() if atom.GetAtomicNum() not in (1, 6)
        )
        return np.array([num_heteroatoms / num_atoms if num_atoms else 0])

    def feature_base_labels(self) -> List[str]:
        """Return the label for the single value produced by ``calculate``."""
        return ["heteroatom_density"]

MaxEStateIndex

Bases: GenericScalarFeaturizer

Maximum electrotopological state (E-state) index, reflecting charge distribution.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class MaxEStateIndex(GenericScalarFeaturizer):
    """Scalar featurizer built on RDKit's ``Descriptors.MaxEStateIndex``.

    Hands that descriptor (the maximum E-state index) and the name
    ``"max_estate_index"`` to ``GenericScalarFeaturizer``.
    """

    def __init__(self, agg: List[str] = None):
        """Set up the featurizer.

        Args:
            agg: Aggregation names, forwarded unchanged (including the
                default ``None``) to the base class.
        """
        super().__init__(
            Descriptors.MaxEStateIndex,
            "max_estate_index",
            agg=agg,
        )

MaxRingSize

Bases: BaseFeatureCalculator

Calculates the size of the largest ring in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class MaxRingSize(BaseFeatureCalculator):
    """Reports the atom count of the largest ring in a molecule."""

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Find the size of the largest ring.

        Args:
            mol: Molecule whose ring information is inspected.
            sanitize: Forwarded to ``self._sanitize`` before measuring.

        Returns:
            np.ndarray: One-element array with the largest ring size, or
                ``0`` when the molecule has no rings.
        """
        self._sanitize(mol, sanitize)
        ring_sizes = (len(ring) for ring in mol.GetRingInfo().AtomRings())
        return np.array([max(ring_sizes, default=0)])

    def feature_base_labels(self) -> List[str]:
        """Return the label for the single value produced by ``calculate``."""
        return ["max_ring_size"]

MolecularWeight

Bases: GenericScalarFeaturizer

Calculates the exact (monoisotopic) molecular weight of the molecule via RDKit's Descriptors.ExactMolWt.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class MolecularWeight(GenericScalarFeaturizer):
    """Scalar featurizer built on RDKit's ``Descriptors.ExactMolWt``.

    Hands that descriptor and the name ``"molecular_weight"`` to
    ``GenericScalarFeaturizer``. Note that the wrapped descriptor is the
    exact molecular weight, not ``Descriptors.MolWt``.
    """

    def __init__(self, agg: List[str] = None):
        """Set up the featurizer.

        Args:
            agg: Aggregation names, forwarded unchanged (including the
                default ``None``) to the base class.
        """
        super().__init__(
            Descriptors.ExactMolWt,
            "molecular_weight",
            agg=agg,
        )

NumAliphaticHeterocycles

Bases: BaseFeatureCalculator

Counts the number of aliphatic heterocycles in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class NumAliphaticHeterocycles(BaseFeatureCalculator):
    """Counts the aliphatic heterocycles in a molecule.

    A ring is counted when it contains at least one non-carbon atom and is
    not fully aromatic, i.e. at least one of its atoms is not flagged
    aromatic. Fully aromatic heterocycles are excluded.
    """

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Count rings that are both heterocyclic and not fully aromatic.

        Args:
            mol: Molecule whose ring information is inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.

        Returns:
            np.ndarray: One-element array holding the count.
        """
        self._sanitize(mol, sanitize)
        num_heterocycles = 0
        for ring in mol.GetRingInfo().AtomRings():
            ring_atoms = [mol.GetAtomWithIdx(idx) for idx in ring]
            has_heteroatom = any(atom.GetAtomicNum() != 6 for atom in ring_atoms)
            # Same "not fully aromatic" criterion NumNonAromaticRings uses.
            fully_aromatic = all(atom.GetIsAromatic() for atom in ring_atoms)
            if has_heteroatom and not fully_aromatic:
                num_heterocycles += 1
        return np.array([num_heterocycles])

    def feature_base_labels(self) -> List[str]:
        """Return the label for the single value produced by ``calculate``."""
        return ["num_aliphatic_heterocycles"]

NumAromaticRings

Bases: BaseFeatureCalculator

Counts the number of aromatic rings in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class NumAromaticRings(BaseFeatureCalculator):
    """Counts rings in which every atom is flagged aromatic."""

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Count fully aromatic rings.

        Args:
            mol: Molecule whose ring information is inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.

        Returns:
            np.ndarray: One-element array holding the count.
        """
        self._sanitize(mol, sanitize)
        aromatic_total = 0
        for ring in mol.GetRingInfo().AtomRings():
            if all(mol.GetAtomWithIdx(idx).GetIsAromatic() for idx in ring):
                aromatic_total += 1
        return np.array([aromatic_total])

    def feature_base_labels(self) -> List[str]:
        """Return the label for the single value produced by ``calculate``."""
        return ["num_aromatic_rings"]

NumAtoms

Bases: BaseFeatureCalculator

Counts the number of atoms in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class NumAtoms(BaseFeatureCalculator):
    """Reports ``mol.GetNumAtoms()`` for a molecule."""

    def calculate(self, mol: Chem.Mol, sanitize: bool = False) -> np.ndarray:
        """Return the molecule's atom count.

        Args:
            mol: Molecule to measure.
            sanitize: Accepted for interface compatibility; this method
                does not use it.

        Returns:
            np.ndarray: One-element array holding ``mol.GetNumAtoms()``.
        """
        atom_total = mol.GetNumAtoms()
        return np.array([atom_total])

    def feature_base_labels(self) -> List[str]:
        """Return the label for the single value produced by ``calculate``."""
        return ["num_atoms"]

NumHBondAcceptors

Bases: GenericScalarFeaturizer

Counts the number of hydrogen bond acceptors.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class NumHBondAcceptors(GenericScalarFeaturizer):
    """Scalar featurizer built on RDKit's ``Descriptors.NumHAcceptors``.

    Hands that descriptor (hydrogen bond acceptor count) and the name
    ``"num_hbond_acceptors"`` to ``GenericScalarFeaturizer``.
    """

    def __init__(self, agg: List[str] = None):
        """Set up the featurizer.

        Args:
            agg: Aggregation names, forwarded unchanged (including the
                default ``None``) to the base class.
        """
        super().__init__(
            Descriptors.NumHAcceptors,
            "num_hbond_acceptors",
            agg=agg,
        )

NumHBondDonors

Bases: GenericScalarFeaturizer

Counts the number of hydrogen bond donors.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class NumHBondDonors(GenericScalarFeaturizer):
    """Scalar featurizer built on RDKit's ``Descriptors.NumHDonors``.

    Hands that descriptor (hydrogen bond donor count) and the name
    ``"num_hbond_donors"`` to ``GenericScalarFeaturizer``.
    """

    def __init__(self, agg: List[str] = None):
        """Set up the featurizer.

        Args:
            agg: Aggregation names, forwarded unchanged (including the
                default ``None``) to the base class.
        """
        super().__init__(
            Descriptors.NumHDonors,
            "num_hbond_donors",
            agg=agg,
        )

NumNonAromaticRings

Bases: BaseFeatureCalculator

Counts the number of non-aromatic rings in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class NumNonAromaticRings(BaseFeatureCalculator):
    """Counts rings that contain at least one non-aromatic atom."""

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Count rings that are not fully aromatic.

        Args:
            mol: Molecule whose ring information is inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.

        Returns:
            np.ndarray: One-element array holding the count.
        """
        self._sanitize(mol, sanitize)
        non_aromatic_total = 0
        for ring in mol.GetRingInfo().AtomRings():
            # One non-aromatic atom is enough to disqualify the ring.
            if any(not mol.GetAtomWithIdx(idx).GetIsAromatic() for idx in ring):
                non_aromatic_total += 1
        return np.array([non_aromatic_total])

    def feature_base_labels(self) -> List[str]:
        """Return the label for the single value produced by ``calculate``."""
        return ["num_non_aromatic_rings"]

NumRings

Bases: BaseFeatureCalculator

Counts the number of rings in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class NumRings(BaseFeatureCalculator):
    """Reports how many rings RDKit's ring info lists for a molecule."""

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Count the entries of ``mol.GetRingInfo().AtomRings()``.

        Args:
            mol: Molecule whose ring information is inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.

        Returns:
            np.ndarray: One-element array holding the ring count.
        """
        self._sanitize(mol, sanitize)
        atom_rings = mol.GetRingInfo().AtomRings()
        ring_total = len(atom_rings)
        return np.array([ring_total])

    def feature_base_labels(self) -> List[str]:
        """Return the label for the single value produced by ``calculate``."""
        return ["num_rings"]

NumRotatableBonds

Bases: GenericScalarFeaturizer

Counts the number of rotatable bonds.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class NumRotatableBonds(GenericScalarFeaturizer):
    """Scalar featurizer built on RDKit's ``Descriptors.NumRotatableBonds``.

    Hands that descriptor and the name ``"num_rotatable_bonds"`` to
    ``GenericScalarFeaturizer``.
    """

    def __init__(self, agg: List[str] = None):
        """Set up the featurizer.

        Args:
            agg: Aggregation names, forwarded unchanged (including the
                default ``None``) to the base class.
        """
        super().__init__(
            Descriptors.NumRotatableBonds,
            "num_rotatable_bonds",
            agg=agg,
        )

SlogPVSA1

Bases: GenericScalarFeaturizer

Calculates SlogP_VSA1: the summed approximate van der Waals surface area of atoms whose logP contribution falls in the lowest (most hydrophilic) bin, a descriptor linked to lipophilicity.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class SlogPVSA1(GenericScalarFeaturizer):
    """Scalar featurizer built on RDKit's ``Descriptors.SlogP_VSA1``.

    Hands that descriptor (surface area binned by atomic logP contribution,
    bin 1) and the name ``"slogp_vsa1"`` to ``GenericScalarFeaturizer``.
    """

    def __init__(self, agg: List[str] = None):
        """Set up the featurizer.

        Args:
            agg: Aggregation names, forwarded unchanged (including the
                default ``None``) to the base class.
        """
        super().__init__(
            Descriptors.SlogP_VSA1,
            "slogp_vsa1",
            agg=agg,
        )

SmrVSA5

Bases: GenericScalarFeaturizer

Calculates SMR_VSA5: the summed approximate van der Waals surface area of atoms whose molar refractivity contribution lies in the range 2.45–2.75.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class SmrVSA5(GenericScalarFeaturizer):
    """Scalar featurizer built on RDKit's ``Descriptors.SMR_VSA5``.

    Hands that descriptor (surface area of atoms whose molar refractivity
    contribution lies in the 2.45–2.75 bin) and the name ``"smr_vsa5"`` to
    ``GenericScalarFeaturizer``.
    """

    def __init__(self, agg: List[str] = None):
        """Set up the featurizer.

        Args:
            agg: Aggregation names, forwarded unchanged (including the
                default ``None``) to the base class.
        """
        super().__init__(
            Descriptors.SMR_VSA5,
            "smr_vsa5",
            agg=agg,
        )

Sp2CarbonCountFeaturizer

Bases: BaseFeatureCalculator

Counts the number of sp2 hybridized carbon atoms in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class Sp2CarbonCountFeaturizer(BaseFeatureCalculator):
    """Counts the sp2-hybridized carbon atoms in a molecule.

    Only atoms with atomic number 6 are considered; sp2 atoms of other
    elements (for example nitrogen or oxygen) are not counted.
    """

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Count carbon atoms whose hybridization is SP2.

        Args:
            mol: Molecule whose atoms are inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.

        Returns:
            np.ndarray: One-element array holding the count.
        """
        self._sanitize(mol, sanitize)
        sp2_count = sum(
            1
            for atom in mol.GetAtoms()
            # Restrict to carbon so the value matches the feature name.
            if atom.GetAtomicNum() == 6
            and atom.GetHybridization() == Chem.HybridizationType.SP2
        )
        return np.array([sp2_count])

    def feature_base_labels(self) -> List[str]:
        """Return the label for the single value produced by ``calculate``."""
        return ["sp2_carbon_count"]

Sp3CarbonCountFeaturizer

Bases: BaseFeatureCalculator

Counts the number of sp3 hybridized carbon atoms in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class Sp3CarbonCountFeaturizer(BaseFeatureCalculator):
    """Counts the sp3-hybridized carbon atoms in a molecule.

    Only atoms with atomic number 6 are considered; sp3 atoms of other
    elements (for example nitrogen or oxygen) are not counted.
    """

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Count carbon atoms whose hybridization is SP3.

        Args:
            mol: Molecule whose atoms are inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.

        Returns:
            np.ndarray: One-element array holding the count.
        """
        self._sanitize(mol, sanitize)
        sp3_count = sum(
            1
            for atom in mol.GetAtoms()
            # Restrict to carbon so the value matches the feature name.
            if atom.GetAtomicNum() == 6
            and atom.GetHybridization() == Chem.HybridizationType.SP3
        )
        return np.array([sp3_count])

    def feature_base_labels(self) -> List[str]:
        """Return the label for the single value produced by ``calculate``."""
        return ["sp3_carbon_count"]

TopologicalSurfaceArea

Bases: GenericScalarFeaturizer

Calculates the topological polar surface area.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class TopologicalSurfaceArea(GenericScalarFeaturizer):
    """Scalar featurizer built on RDKit's ``Descriptors.TPSA``.

    Hands that descriptor (topological polar surface area) and the name
    ``"topological_surface_area"`` to ``GenericScalarFeaturizer``.
    """

    def __init__(self, agg: List[str] = None):
        """Set up the featurizer.

        Args:
            agg: Aggregation names, forwarded unchanged (including the
                default ``None``) to the base class.
        """
        super().__init__(
            Descriptors.TPSA,
            "topological_surface_area",
            agg=agg,
        )

polymer

Polymer

A class to represent a polymer molecule and extract its backbone and sidechain information.

Attributes:

Name Type Description
psmiles Optional[str]

Optional[str], the pSMILES string representing the polymer molecule.

graph Graph

Optional[nx.Graph], a NetworkX graph representing the polymer structure.

backbone_nodes List[int]

Optional[List[int]], list of node indices forming the polymer backbone.

sidechain_nodes List[int]

Optional[List[int]], list of node indices forming the sidechains.

connection_points List[int]

Optional[List[int]], list of node indices representing connection points.

Raises:

Type Description
ValueError

If the provided pSMILES string is invalid or cannot be processed.

Source code in src/polymetrix/featurizers/polymer.py
class Polymer:
    """A class to represent a polymer molecule and extract its backbone and sidechain information.

    All state is derived from the pSMILES string: assigning ``psmiles``
    rebuilds the graph, the connection points and the backbone/sidechain
    split. Until a pSMILES string has been assigned, every attribute below
    is ``None``.

    Attributes:
        psmiles: Optional[str], the pSMILES string representing the polymer molecule.
        graph: Optional[nx.Graph], a NetworkX graph representing the polymer structure.
        backbone_nodes: Optional[List[int]], list of node indices forming the polymer backbone.
        sidechain_nodes: Optional[List[int]], list of node indices forming the sidechains.
        connection_points: Optional[List[int]], list of node indices representing connection points.

    Raises:
        ValueError: If the provided pSMILES string is invalid or cannot be processed.
    """

    def __init__(self):
        """Creates an empty polymer; assign ``psmiles`` to populate it."""
        self._psmiles: Optional[str] = None
        self._graph: Optional[nx.Graph] = None
        self._backbone_nodes: Optional[List[int]] = None
        self._sidechain_nodes: Optional[List[int]] = None
        self._connection_points: Optional[List[int]] = None

    @classmethod
    def from_psmiles(cls, psmiles: str) -> "Polymer":
        """Creates a Polymer instance from a pSMILES string.

        Args:
            psmiles: str, the pSMILES string representing the polymer molecule.

        Returns:
            Polymer: A new Polymer object initialized with the given pSMILES string.

        Raises:
            ValueError: If the pSMILES string is invalid.
        """
        polymer = cls()
        polymer.psmiles = psmiles
        return polymer

    @property
    def psmiles(self) -> Optional[str]:
        """Gets the pSMILES string of the polymer.

        Returns:
            Optional[str]: The pSMILES string, or None if not set.
        """
        return self._psmiles

    @psmiles.setter
    def psmiles(self, value: str):
        """Sets the pSMILES string and updates the polymer's internal structure.

        The update is all-or-nothing: if any step fails, the polymer keeps
        the state it had before the assignment.

        Args:
            value: str, the pSMILES string to set.

        Raises:
            ValueError: If the pSMILES string is invalid or cannot be processed.
        """
        state_attrs = (
            "_psmiles",
            "_graph",
            "_connection_points",
            "_backbone_nodes",
            "_sidechain_nodes",
        )
        # Snapshot first so a failure part-way through can be undone.
        snapshot = {name: getattr(self, name, None) for name in state_attrs}
        try:
            mol = Chem.MolFromSmiles(value)
            if mol is None:
                raise ValueError("Invalid pSMILES string")
            self._psmiles = value
            self._graph = self._mol_to_nx(mol)
            self._identify_connection_points()
            self._identify_backbone_and_sidechain()
        except Exception as e:
            # Restore the previous, self-consistent state before reporting.
            for name, previous in snapshot.items():
                setattr(self, name, previous)
            raise ValueError(f"Error processing pSMILES: {str(e)}") from e

    def _mol_to_nx(self, mol: Chem.Mol) -> nx.Graph:
        """Converts an RDKit molecule to a NetworkX graph.

        Nodes are keyed by RDKit atom index and carry ``atomic_num``,
        ``element``, ``formal_charge`` and ``is_aromatic``; edges carry
        ``bond_type`` and ``is_aromatic``.

        Args:
            mol: Chem.Mol, the RDKit molecule object to convert.

        Returns:
            nx.Graph: A NetworkX graph representing the molecule's structure.
        """
        G = nx.Graph()
        for atom in mol.GetAtoms():
            G.add_node(
                atom.GetIdx(),
                atomic_num=atom.GetAtomicNum(),
                element=atom.GetSymbol(),
                formal_charge=atom.GetFormalCharge(),
                is_aromatic=atom.GetIsAromatic(),
            )
        for bond in mol.GetBonds():
            G.add_edge(
                bond.GetBeginAtomIdx(),
                bond.GetEndAtomIdx(),
                bond_type=bond.GetBondType(),
                is_aromatic=bond.GetIsAromatic(),
            )
        return G

    def _identify_connection_points(self):
        """Identifies connection points (asterisk atoms) in the polymer graph."""
        self._connection_points = [
            node
            for node, data in self._graph.nodes(data=True)
            if data["element"] == "*"
        ]

    def _identify_backbone_and_sidechain(self):
        """Classifies nodes into backbone and sidechain components."""
        self._backbone_nodes, self._sidechain_nodes = classify_backbone_and_sidechains(
            self._graph
        )

    @property
    def backbone_nodes(self) -> Optional[List[int]]:
        """Gets the list of backbone node indices.

        Returns:
            Optional[List[int]]: Node indices representing the backbone, or
                None if no pSMILES string has been set.
        """
        return self._backbone_nodes

    @property
    def sidechain_nodes(self) -> Optional[List[int]]:
        """Gets the list of sidechain node indices.

        Returns:
            Optional[List[int]]: Node indices representing the sidechains,
                or None if no pSMILES string has been set.
        """
        return self._sidechain_nodes

    @property
    def connection_points(self) -> Optional[List[int]]:
        """Gets the list of connection point node indices.

        Property counterpart of ``get_connection_points``.

        Returns:
            Optional[List[int]]: Node indices representing connection
                points, or None if no pSMILES string has been set.
        """
        return self._connection_points

    @property
    def graph(self) -> Optional[nx.Graph]:
        """Gets the NetworkX graph of the polymer.

        Returns:
            Optional[nx.Graph]: The graph representing the polymer
                structure, or None if no pSMILES string has been set.
        """
        return self._graph

    def get_backbone_and_sidechain_molecules(
        self,
    ) -> Tuple[List[Chem.Mol], List[Chem.Mol]]:
        """Extracts RDKit molecule objects for the backbone and sidechains.

        Returns:
            Tuple[List[Chem.Mol], List[Chem.Mol]]: A tuple containing a list with the backbone
                molecule and a list of sidechain molecules, one per connected
                sidechain component.
        """
        backbone_mol = self._subgraph_to_mol(self._graph.subgraph(self._backbone_nodes))
        sidechain_mols = [
            self._subgraph_to_mol(self._graph.subgraph(nodes))
            for nodes in nx.connected_components(
                self._graph.subgraph(self._sidechain_nodes)
            )
        ]
        return [backbone_mol], sidechain_mols

    def get_backbone_and_sidechain_graphs(
        self,
    ) -> Tuple[List[nx.Graph], List[nx.Graph]]:
        """Extracts NetworkX graphs for the backbone and sidechains.

        Returns:
            Tuple[List[nx.Graph], List[nx.Graph]]: A tuple containing a
                single-element list with the backbone graph and a list of
                sidechain graphs, one per connected sidechain component.
        """
        backbone_graph = self._graph.subgraph(self._backbone_nodes)
        sidechain_graphs = [
            self._graph.subgraph(nodes)
            for nodes in nx.connected_components(
                self._graph.subgraph(self._sidechain_nodes)
            )
        ]
        return [backbone_graph], sidechain_graphs

    def _subgraph_to_mol(self, subgraph: nx.Graph) -> Chem.Mol:
        """Converts a NetworkX subgraph to an RDKit molecule.

        Atoms are rebuilt from ``atomic_num`` (plus ``formal_charge`` when
        present) and bonds from ``bond_type``; other node and edge
        attributes are not transferred.

        Args:
            subgraph: nx.Graph, the subgraph to convert.

        Returns:
            Chem.Mol: The RDKit molecule object created from the subgraph.
        """
        mol = Chem.RWMol()
        # Graph node ids need not be contiguous, so map them to new atom indices.
        node_to_idx = {}
        for node in subgraph.nodes():
            atom = Chem.Atom(subgraph.nodes[node]["atomic_num"])
            if "formal_charge" in subgraph.nodes[node]:
                atom.SetFormalCharge(subgraph.nodes[node]["formal_charge"])
            idx = mol.AddAtom(atom)
            node_to_idx[node] = idx
        for u, v, data in subgraph.edges(data=True):
            mol.AddBond(node_to_idx[u], node_to_idx[v], data["bond_type"])
        return mol.GetMol()

    def calculate_molecular_weight(self) -> float:
        """Calculates the exact molecular weight of the polymer.

        The molecule is re-parsed from the stored pSMILES string and passed
        to ``ExactMolWt``.

        Returns:
            float: The molecular weight of the polymer molecule.
        """
        mol = Chem.MolFromSmiles(self._psmiles)
        return ExactMolWt(mol)

    def get_connection_points(self) -> Optional[List[int]]:
        """Gets the list of connection point node indices.

        Returns:
            Optional[List[int]]: Node indices representing connection
                points, or None if no pSMILES string has been set.
        """
        return self._connection_points
backbone_nodes property

Gets the list of backbone node indices.

Returns:

Type Description
List[int]

List[int]: List of node indices representing the backbone.

graph property

Gets the NetworkX graph of the polymer.

Returns:

Type Description
Graph

nx.Graph: The graph representing the polymer structure.

psmiles property writable

Gets the pSMILES string of the polymer.

Returns:

Type Description
Optional[str]

Optional[str]: The pSMILES string, or None if not set.

sidechain_nodes property

Gets the list of sidechain node indices.

Returns:

Type Description
List[int]

List[int]: List of node indices representing the sidechains.

calculate_molecular_weight()

Calculates the exact molecular weight of the polymer.

Returns:

Name Type Description
float float

The molecular weight of the polymer molecule.

Source code in src/polymetrix/featurizers/polymer.py
def calculate_molecular_weight(self) -> float:
    """Return the exact molecular weight for the stored pSMILES string.

    The molecule is re-parsed from ``self._psmiles`` and handed to
    ``ExactMolWt``.

    Returns:
        float: The value computed by ``ExactMolWt``.
    """
    return ExactMolWt(Chem.MolFromSmiles(self._psmiles))
from_psmiles(psmiles) classmethod

Creates a Polymer instance from a pSMILES string.

Parameters:

Name Type Description Default
psmiles str

str, the pSMILES string representing the polymer molecule.

required

Returns:

Name Type Description
Polymer Polymer

A new Polymer object initialized with the given pSMILES string.

Raises:

Type Description
ValueError

If the pSMILES string is invalid.

Source code in src/polymetrix/featurizers/polymer.py
@classmethod
def from_psmiles(cls, psmiles: str) -> "Polymer":
    """Build a polymer directly from a pSMILES string.

    An empty instance is created and the string is assigned to its
    ``psmiles`` attribute.

    Args:
        psmiles: The pSMILES string describing the polymer.

    Returns:
        Polymer: The new instance with ``psmiles`` assigned.

    Raises:
        ValueError: If the pSMILES string is invalid.
    """
    instance = cls()
    instance.psmiles = psmiles
    return instance
get_backbone_and_sidechain_graphs()

Extracts NetworkX graphs for the backbone and sidechains.

Returns:

Type Description
Tuple[Graph, List[Graph]]

Tuple[List[nx.Graph], List[nx.Graph]]: A tuple containing a single-element list with the backbone graph and a list of sidechain graphs.

Source code in src/polymetrix/featurizers/polymer.py
def get_backbone_and_sidechain_graphs(
    self,
) -> Tuple[List[nx.Graph], List[nx.Graph]]:
    """Extracts NetworkX graphs for the backbone and sidechains.

    Both results are subgraphs of ``self._graph``. The backbone is wrapped
    in a one-element list; the sidechains are split into one graph per
    connected component of the sidechain nodes.

    Returns:
        Tuple[List[nx.Graph], List[nx.Graph]]: A tuple containing a
            single-element list with the backbone graph and a list of
            sidechain graphs.
    """
    backbone_graph = self._graph.subgraph(self._backbone_nodes)
    sidechain_graphs = [
        self._graph.subgraph(nodes)
        for nodes in nx.connected_components(
            self._graph.subgraph(self._sidechain_nodes)
        )
    ]
    return [backbone_graph], sidechain_graphs
get_backbone_and_sidechain_molecules()

Extracts RDKit molecule objects for the backbone and sidechains.

Returns:

Type Description
Tuple[List[Mol], List[Mol]]

Tuple[List[Chem.Mol], List[Chem.Mol]]: A tuple containing a list with the backbone molecule and a list of sidechain molecules.

Source code in src/polymetrix/featurizers/polymer.py
def get_backbone_and_sidechain_molecules(
    self,
) -> Tuple[List[Chem.Mol], List[Chem.Mol]]:
    """Build RDKit molecules for the backbone and for each sidechain.

    The backbone nodes are converted as one molecule. The sidechain nodes
    are split into connected components and each component is converted
    separately.

    Returns:
        Tuple[List[Chem.Mol], List[Chem.Mol]]: A one-element list holding
            the backbone molecule, and a list of sidechain molecules.
    """
    backbone_view = self._graph.subgraph(self._backbone_nodes)
    backbone_mol = self._subgraph_to_mol(backbone_view)

    sidechain_view = self._graph.subgraph(self._sidechain_nodes)
    sidechain_mols = []
    for component in nx.connected_components(sidechain_view):
        component_view = self._graph.subgraph(component)
        sidechain_mols.append(self._subgraph_to_mol(component_view))

    return [backbone_mol], sidechain_mols
get_connection_points()

Gets the list of connection point node indices.

Returns:

Type Description
List[int]

List[int]: List of node indices representing connection points.

Source code in src/polymetrix/featurizers/polymer.py
def get_connection_points(self) -> List[int]:
    """Return the stored connection point node indices.

    Returns:
        List[int]: The value of ``self._connection_points``.
    """
    return self._connection_points

add_degree_one_nodes_to_backbone(graph, backbone)

Adds degree-1 nodes connected to backbone nodes to the backbone list.

Parameters:

Name Type Description Default
graph Graph

nx.Graph, the input graph to analyze.

required
backbone List[int]

List[int], the initial list of backbone node indices.

required

Returns:

Type Description
List[int]

List[int]: The updated backbone list including degree-1 nodes.

Source code in src/polymetrix/featurizers/polymer.py
def add_degree_one_nodes_to_backbone(graph: nx.Graph, backbone: List[int]) -> List[int]:
    """Adds degree-1 nodes connected to backbone nodes to the backbone list.

    ``backbone`` is extended in place and also returned. Nodes already in
    the list are never appended a second time.

    Args:
        graph: nx.Graph, the input graph to analyze.
        backbone: List[int], the initial list of backbone node indices.

    Returns:
        List[int]: The updated backbone list including degree-1 nodes.
    """
    # A set mirror gives O(1) membership tests and prevents duplicate appends.
    members = set(backbone)
    for node in graph.nodes:
        if graph.degree[node] != 1 or node in members:
            continue
        # Degree 1 guarantees exactly one neighbor.
        neighbor = next(iter(graph.neighbors(node)))
        if neighbor in members:
            backbone.append(node)
            members.add(node)
    return backbone

classify_backbone_and_sidechains(graph)

Classifies nodes into backbone and sidechain components based on paths and cycles.

Parameters:

Name Type Description Default
graph Graph

nx.Graph, the input graph to classify.

required

Returns:

Type Description
Tuple[List[int], List[int]]

Tuple[List[int], List[int]]: A tuple containing the list of backbone nodes and the list of sidechain nodes.

Source code in src/polymetrix/featurizers/polymer.py
def classify_backbone_and_sidechains(graph: nx.Graph) -> Tuple[List[int], List[int]]:
    """Classifies nodes into backbone and sidechain components based on paths and cycles.

    The backbone is the union of: every node on a shortest path between two
    star nodes, every node on a cycle that touches such a path, and every
    degree-1 node attached to one of those nodes. All remaining nodes are
    sidechain nodes.

    Args:
        graph: nx.Graph, the input graph to classify.

    Returns:
        Tuple[List[int], List[int]]: A tuple containing the list of backbone nodes and
            the list of sidechain nodes (in ``graph.nodes`` order).
    """
    shortest_paths = find_shortest_paths_between_stars(graph)
    cycles = find_cycles_including_paths(graph, shortest_paths)
    seed_nodes = set()
    for cycle in cycles:
        # Each cycle is a list of (u, v) edges; collect both endpoints.
        for edge in cycle:
            seed_nodes.update(edge)
    for path in shortest_paths:
        seed_nodes.update(path)
    # Keep the backbone as a set so the sidechain filter is O(1) per node.
    backbone = set(add_degree_one_nodes_to_backbone(graph, list(seed_nodes)))
    sidechain_nodes = [node for node in graph.nodes if node not in backbone]
    return list(backbone), sidechain_nodes

find_cycles_including_paths(graph, paths)

Identifies cycles in the graph that include nodes from the given paths.

Parameters:

Name Type Description Default
graph Graph

nx.Graph, the input graph to analyze.

required
paths List[List[int]]

List[List[int]], list of paths whose nodes are used to filter cycles.

required

Returns:

Type Description
List[List[int]]

List[List[Tuple[int, int]]]: A list of unique cycles, where each cycle is a sorted list of (min node, max node) edge tuples.

Source code in src/polymetrix/featurizers/polymer.py
def find_cycles_including_paths(
    graph: nx.Graph, paths: List[List[int]]
) -> List[List[Tuple[int, int]]]:
    """Identifies cycles in the graph that include nodes from the given paths.

    Cycles come from ``nx.cycle_basis``; a cycle is kept when at least one
    of its nodes lies on any of ``paths``. Each kept cycle is returned as
    its edges rather than its nodes.

    Args:
        graph: nx.Graph, the input graph to analyze.
        paths: List[List[int]], list of paths whose nodes are used to filter cycles.

    Returns:
        List[List[Tuple[int, int]]]: A list of unique cycles, where each
            cycle is a sorted list of ``(min_node, max_node)`` edge tuples.
    """
    path_nodes = {node for path in paths for node in path}
    unique_cycles = set()
    for cycle in nx.cycle_basis(graph):
        if path_nodes.isdisjoint(cycle):
            continue
        # Pair each node with its successor, wrapping around to close the cycle.
        successors = cycle[1:] + [cycle[0]]
        edges = sorted((min(a, b), max(a, b)) for a, b in zip(cycle, successors))
        unique_cycles.add(tuple(edges))
    return [list(edges) for edges in unique_cycles]

find_shortest_paths_between_stars(graph)

Finds shortest paths between all pairs of asterisk (*) nodes in the graph.

Parameters:

Name Type Description Default
graph Graph

nx.Graph, the input graph to analyze.

required

Returns:

Type Description
List[List[int]]

List[List[int]]: A list of shortest paths, where each path is a list of node indices.

Source code in src/polymetrix/featurizers/polymer.py
def find_shortest_paths_between_stars(graph: nx.Graph) -> List[List[int]]:
    """Collect one shortest path for every pair of asterisk (*) nodes.

    Star nodes are the nodes whose ``element`` attribute equals ``"*"``.
    Pairs with no connecting path are skipped.

    Args:
        graph: The graph to search; every node needs an ``element``
            attribute.

    Returns:
        List[List[int]]: The shortest paths found, each a list of node
            indices.
    """
    stars = [
        node for node, attrs in graph.nodes(data=True) if attrs["element"] == "*"
    ]
    found = []
    for position, source in enumerate(stars):
        # Only pair with later stars so each unordered pair is visited once.
        for target in stars[position + 1 :]:
            try:
                found.append(nx.shortest_path(graph, source=source, target=target))
            except nx.NetworkXNoPath:
                continue
    return found

sidechain_backbone_featurizer

SidechainDiversityFeaturizer

Bases: BaseFeatureCalculator

Computes the number of structurally diverse sidechains in a polymer based on graph isomorphism.

Source code in src/polymetrix/featurizers/sidechain_backbone_featurizer.py
class SidechainDiversityFeaturizer(BaseFeatureCalculator):
    """Counts the distinct sidechain structures found in a polymer.

    Two sidechains are considered the same structure when their
    Weisfeiler-Lehman graph hashes are equal. The hash is requested with
    default arguments, i.e. no node or edge attribute names are passed.
    """

    def featurize(self, polymer) -> np.ndarray:
        """Return a one-element array with the number of distinct sidechain hashes.

        Args:
            polymer: Object exposing ``get_backbone_and_sidechain_graphs()``;
                the element at index 1 of its result is iterated as the
                sidechain graphs.

        Returns:
            np.ndarray: Array of length 1 holding the count.
        """
        graphs = polymer.get_backbone_and_sidechain_graphs()
        # A set keeps each hash once, so its size is the diversity count.
        distinct = {nx.weisfeiler_lehman_graph_hash(sc) for sc in graphs[1]}
        return np.array([len(distinct)])

    def feature_labels(self) -> List[str]:
        """Return the single label of the value produced by ``featurize``."""
        return ["num_diverse_sidechains"]

SidechainLengthToStarAttachmentDistanceRatioFeaturizer

Bases: BaseFeatureCalculator

Computes aggregated ratios of sidechain lengths to the shortest backbone distance from the polymer's star node (*) to each sidechain's attachment point.

Source code in src/polymetrix/featurizers/sidechain_backbone_featurizer.py
class SidechainLengthToStarAttachmentDistanceRatioFeaturizer(BaseFeatureCalculator):
    """Computes aggregated ratios of sidechain length to star-to-attachment distance.

    For every sidechain the number of sidechain nodes is divided by the
    shortest distance (plus one) from any star node (*) to any node the
    sidechain is attached to.
    """

    def _compute_min_backbone_length(self, sidechain, star_nodes, star_paths, graph):
        """Calculate the minimum distance from any star node to the sidechain's attachment points.

        Args:
            sidechain: Graph of one sidechain; its nodes are looked up in ``graph``.
            star_nodes: Star nodes to measure from.
            star_paths: Mapping ``star -> {node: shortest path length from star}``.
            graph: Full polymer graph, used to find neighbours of sidechain nodes.

        Returns:
            The smallest ``path length + 1`` over all (star, attachment point)
            pairs, or ``float("inf")`` when no attachment point is reachable
            from any star.
        """
        side_nodes = set(sidechain.nodes())
        # Every neighbour of a sidechain node that is not itself part of the
        # sidechain is an attachment point. All of them are examined: picking
        # a single arbitrary one would make the "minimum" depend on set
        # iteration order when a node touches several outside nodes.
        attachment_points = {
            neighbor
            for node in side_nodes
            for neighbor in graph.neighbors(node)
            if neighbor not in side_nodes
        }
        candidate_lengths = [
            star_paths[star][point] + 1
            for star in star_nodes
            for point in attachment_points
            if point in star_paths[star]
        ]
        return min(candidate_lengths, default=float("inf"))

    def featurize(self, polymer) -> np.ndarray:
        """Return the aggregated sidechain-length / backbone-distance ratios.

        Args:
            polymer: Object exposing ``graph`` (nodes carry an "element"
                attribute) and ``get_backbone_and_sidechain_graphs()``.

        Returns:
            np.ndarray: ``self.aggregate`` applied to the ratios, or zeros of
            length ``len(self.agg)`` when there are no sidechains, no backbone
            graphs, or no ratios.
        """
        graph = polymer.graph
        star_nodes = [
            node for node, data in graph.nodes(data=True) if data["element"] == "*"
        ]
        backbone_graphs, sidechain_graphs = polymer.get_backbone_and_sidechain_graphs()

        if not sidechain_graphs or not backbone_graphs:
            return np.zeros(len(self.agg))

        sidechain_lengths = [len(sc.nodes()) for sc in sidechain_graphs]
        # One search per star gives the distance to every reachable node.
        star_paths = {
            star: nx.single_source_shortest_path_length(graph, star)
            for star in star_nodes
        }

        backbone_lengths = [
            self._compute_min_backbone_length(sidechain, star_nodes, star_paths, graph)
            for sidechain in sidechain_graphs
        ]

        # An unreachable sidechain has an infinite backbone length, which
        # passes the filter and yields a ratio of 0.0.
        ratios = [
            s_length / b_length
            for s_length, b_length in zip(sidechain_lengths, backbone_lengths)
            if b_length > 0
        ]
        if not ratios:
            return np.zeros(len(self.agg))

        agg_ratios = self.aggregate(ratios)
        return np.array(agg_ratios)

    def feature_base_labels(self) -> List[str]:
        """Return the base label used for this featurizer's outputs."""
        return ["sidechainlength_to_star_attachment_distance_ratio"]

StarToSidechainMinDistanceFeaturizer

Bases: BaseFeatureCalculator

Computes aggregated minimum backbone distances from star nodes (*) to sidechains in a polymer.

Source code in src/polymetrix/featurizers/sidechain_backbone_featurizer.py
class StarToSidechainMinDistanceFeaturizer(BaseFeatureCalculator):
    """Computes aggregated minimum backbone distances from star nodes (*) to sidechains in a polymer."""

    def featurize(self, polymer) -> np.ndarray:
        """Return the aggregated minimum star-to-sidechain distances.

        For each sidechain the value is the smallest ``shortest path length - 1``
        between any star node and any node of that sidechain; sidechains that
        no star can reach are skipped.

        Args:
            polymer: Object exposing ``graph`` (nodes carry an "element"
                attribute) and ``get_backbone_and_sidechain_graphs()``.

        Returns:
            np.ndarray: ``self.aggregate`` applied to the per-sidechain
            minima, or zeros of length ``len(self.agg)`` when there are none.
        """
        graph = polymer.graph
        star_nodes = [
            node for node, data in graph.nodes(data=True) if data["element"] == "*"
        ]
        sidechain_graphs = polymer.get_backbone_and_sidechain_graphs()[1]

        if not sidechain_graphs:
            return np.zeros(len(self.agg))

        # One search per star yields the distance to every reachable node.
        # This replaces a has_path + shortest_path_length search for every
        # (star, sidechain node) pair while producing the same distances.
        star_distances = [
            nx.single_source_shortest_path_length(graph, star) for star in star_nodes
        ]

        distances = []
        for sidechain in sidechain_graphs:
            valid_dists = [
                reachable[node] - 1
                for reachable in star_distances
                for node in sidechain.nodes()
                if node in reachable  # absent key means no path from that star
            ]
            if valid_dists:
                distances.append(min(valid_dists))

        if not distances:
            return np.zeros(len(self.agg))

        return self.aggregate(distances)

    def feature_base_labels(self) -> List[str]:
        """Return the base label used for this featurizer's outputs."""
        return ["star_to_sidechain_min_distance"]

splitters

splitters

PolymerClassSplitter

Bases: BaseSplitter

Splitter based on polymer class

Source code in src/polymetrix/splitters/splitters.py
class PolymerClassSplitter(BaseSplitter):
    """Splitter whose groups come from one metadata column of the dataset.

    By default the column is ``"meta.polymer_class"``.
    """

    def __init__(
        self,
        ds: AbstractDataset,
        column_name: str = "meta.polymer_class",
        shuffle: bool = True,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
        **kwargs,
    ) -> None:
        """Store the grouping column and delegate the rest to ``BaseSplitter``.

        Args:
            ds: Dataset to split.
            column_name: Name of the metadata column that defines the groups.
            shuffle: Forwarded to ``BaseSplitter``.
            random_state: Forwarded to ``BaseSplitter``.
            **kwargs: Additional arguments forwarded to ``BaseSplitter``.
        """
        # Set before the base initializer runs, as in the original ordering.
        # NOTE(review): presumably the base class may call _get_groups during
        # initialization -- confirm.
        self._column_name = column_name
        super().__init__(ds=ds, shuffle=shuffle, random_state=random_state, **kwargs)

    def _get_groups(self) -> Collection[str]:
        """Return the configured metadata column as a flattened array.

        Raises:
            ValueError: If the column name is not among the dataset's
                metadata names.
        """
        dataset = self._ds
        column = dataset._meta_names.index(self._column_name)
        return dataset._meta_data[:, column].flatten()

TgSplitter

Bases: BaseSplitter

Splitter based on Tg values

Source code in src/polymetrix/splitters/splitters.py
class TgSplitter(BaseSplitter):
    """Splitter whose groups are quantile bins of a label (Tg by default)."""

    def __init__(
        self,
        ds: AbstractDataset,
        tg_q: Optional[Collection[float]] = None,
        label_name: str = "labels.Exp_Tg(K)",
        shuffle: bool = True,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
        **kwargs,
    ) -> None:
        """Set up a splitter that groups samples by binned label values.

        Args:
            ds: Dataset to split.
            tg_q: Quantiles handed to ``quantile_binning`` when forming groups.
            label_name: Name of the label whose values are binned.
            shuffle: Whether to shuffle the dataset.
            random_state: Random state for shuffling.
            **kwargs: Additional arguments to pass to BaseSplitter.
        """
        # Both attributes are assigned before the base initializer runs, as in
        # the original ordering.
        self._label_name = label_name
        self._grouping_q = tg_q
        super().__init__(ds=ds, shuffle=shuffle, random_state=random_state, **kwargs)

    def _get_groups(self) -> Collection[int]:
        """Return the quantile-bin group of every sample in the dataset."""
        every_index = range(len(self._ds))
        label_values = self._ds.get_labels(
            idx=every_index, label_names=[self._label_name]
        )
        return quantile_binning(label_values.flatten(), self._grouping_q)
__init__(ds, tg_q=None, label_name='labels.Exp_Tg(K)', shuffle=True, random_state=None, **kwargs)

Initialize TgSplitter

Parameters:

Name Type Description Default
ds AbstractDataset

Dataset to split

required
tg_q Optional[Collection[float]]

Quantiles to bin Tg values into groups

None
label_name str

Name of the label to use for splitting

'labels.Exp_Tg(K)'
shuffle bool

Whether to shuffle the dataset

True
random_state Optional[Union[int, RandomState]]

Random state for shuffling

None
**kwargs

Additional arguments to pass to BaseSplitter

{}
Source code in src/polymetrix/splitters/splitters.py
def __init__(
    self,
    ds: AbstractDataset,
    tg_q: Optional[Collection[float]] = None,
    label_name: str = "labels.Exp_Tg(K)",
    shuffle: bool = True,
    random_state: Optional[Union[int, np.random.RandomState]] = None,
    **kwargs,
) -> None:
    """Set up a splitter that groups samples by binned label values.

    Args:
        ds: Dataset to split.
        tg_q: Quantiles used to bin the label values into groups.
        label_name: Name of the label whose values are binned.
        shuffle: Whether to shuffle the dataset.
        random_state: Random state for shuffling.
        **kwargs: Additional arguments to pass to BaseSplitter.
    """
    # Both attributes are assigned before the base initializer runs, as in
    # the original ordering.
    self._label_name = label_name
    self._grouping_q = tg_q
    super().__init__(ds=ds, shuffle=shuffle, random_state=random_state, **kwargs)