Skip to content

API Reference

datasets

AbstractDataset

Bases: ABC

Base class for polymer datasets.

Source code in src/polymetrix/datasets/dataset.py
class AbstractDataset(ABC):
    """Base class for polymer datasets.

    Concrete datasets implement ``_load_data`` and fill the private arrays
    ``_features``, ``_labels``, ``_meta_data`` and ``_psmiles`` together with
    the matching ``_feature_names``, ``_label_names`` and ``_meta_names``.
    """

    def __init__(self):
        """Initialize an empty dataset (arrays unset, name lists empty)."""
        self._meta_data = None
        self._features = None
        self._labels = None
        self._psmiles = None
        self._feature_names = []
        self._label_names = []
        self._meta_names = []

    @abstractmethod
    def _load_data(self, subset: Optional[Collection[int]] = None):
        """Load and prepare the dataset-specific data.

        Args:
            subset (Optional[Collection[int]]): Indices to include in the dataset.
        """
        pass

    @staticmethod
    def _as_index(idx: Collection[int]) -> np.ndarray:
        """Convert a collection of row indices into a NumPy index array.

        Converting explicitly makes tuples behave like lists (a raw tuple
        would be read by NumPy as a multi-dimensional index) and makes an
        empty collection usable as an index.
        """
        index = np.asarray(idx)
        if index.size == 0:
            # An empty array defaults to float64, which NumPy rejects as an index.
            index = index.astype(int)
        return index

    def _select(self, values, names, idx, wanted) -> np.ndarray:
        """Return rows ``idx`` of ``values``, optionally limited to named columns."""
        if wanted is None:
            return values[self._as_index(idx)]
        # Resolve names first so an unknown name raises ValueError before any
        # row indexing is attempted.
        columns = [names.index(name) for name in wanted]
        return values[self._as_index(idx)][:, columns]

    def get_subset(self, indices: Collection[int]) -> "AbstractDataset":
        """Get a subset of the dataset.

        Args:
            indices (Collection[int]): Row positions to keep; each must lie
                in ``[0, len(self))``.

        Returns:
            AbstractDataset: New instance of the same class holding only the
            selected rows and copies of the name lists.

        Raises:
            IndexError: If any index is negative or not smaller than
                ``len(self)``.
        """
        if not all(0 <= i < len(self) for i in indices):
            raise IndexError("Indices out of bounds.")
        index = self._as_index(indices)

        def take(values):
            # Arrays that were never loaded stay None in the subset.
            return None if values is None else values[index]

        subset = self.__class__()
        subset._features = take(self._features)
        subset._labels = take(self._labels)
        subset._meta_data = take(self._meta_data)
        subset._psmiles = take(self._psmiles)
        subset._feature_names = self._feature_names.copy()
        subset._label_names = self._label_names.copy()
        subset._meta_names = self._meta_names.copy()
        return subset

    @property
    def available_features(self) -> list[str]:
        """List of available features.

        Returns:
            list[str]: List of feature names
        """
        return self._feature_names

    @property
    def available_labels(self) -> list[str]:
        """List of available labels.

        Returns:
            list[str]: List of label names
        """
        return self._label_names

    @property
    def meta_info(self) -> list[str]:
        """List of available metadata fields.

        Returns:
            list[str]: List of metadata field names
        """
        return self._meta_names

    @property
    def psmiles(self) -> np.ndarray:
        """Return the polymer SMILES strings.

        Returns:
            np.ndarray: Array of polymer SMILES strings
        """
        return self._psmiles

    def __len__(self):
        """Return the number of entries in the dataset (0 if nothing is loaded)."""
        return len(self._features) if self._features is not None else 0

    def __iter__(self):
        """Iterate over the features in the dataset.

        An unloaded dataset yields nothing, matching ``len(self) == 0``.
        """
        return iter(self._features if self._features is not None else ())

    def get_features(
        self, idx: Collection[int], feature_names: Optional[Collection[str]] = None
    ) -> np.ndarray:
        """Get features for specified indices.

        Args:
            idx (Collection[int]): Indices of entries.
            feature_names (Optional[Collection[str]]): Names of features to return.
                If None, returns all available features.

        Returns:
            np.ndarray: Array of feature values.

        Raises:
            ValueError: If a requested name is not in ``available_features``.
        """
        return self._select(self._features, self._feature_names, idx, feature_names)

    def get_labels(
        self, idx: Collection[int], label_names: Optional[Collection[str]] = None
    ) -> np.ndarray:
        """Get labels for specified indices.

        Args:
            idx (Collection[int]): Indices of entries.
            label_names (Optional[Collection[str]]): Names of labels to return.
                If None, returns all available labels.

        Returns:
            np.ndarray: Array of label values.

        Raises:
            ValueError: If a requested name is not in ``available_labels``.
        """
        return self._select(self._labels, self._label_names, idx, label_names)

    def get_meta(
        self, idx: Collection[int], meta_keys: Optional[Collection[str]] = None
    ) -> np.ndarray:
        """Get metadata for specified indices.

        Args:
            idx (Collection[int]): Indices of entries.
            meta_keys (Optional[Collection[str]]): Names of metadata fields to return.
                If None, returns all available metadata.

        Returns:
            np.ndarray: Array of metadata values.

        Raises:
            ValueError: If a requested key is not in ``meta_info``.
        """
        return self._select(self._meta_data, self._meta_names, idx, meta_keys)

available_features property

List of available features. Returns: list[str]: List of feature names

available_labels property

List of available labels. Returns: list[str]: List of label names

meta_info property

List of available metadata fields. Returns: list[str]: List of metadata field names

psmiles property

Return the polymer SMILES strings. Returns: np.ndarray: Array of polymer SMILES strings

__init__()

Initialize a dataset.

Source code in src/polymetrix/datasets/dataset.py
def __init__(self):
    """Set up an empty dataset: no arrays yet and no feature/label/meta names."""
    # The four data containers all start out unset.
    self._meta_data = self._features = self._labels = self._psmiles = None
    # Each name list is its own fresh list object.
    self._feature_names, self._label_names, self._meta_names = [], [], []

__iter__()

Iterate over the features in the dataset.

Source code in src/polymetrix/datasets/dataset.py
def __iter__(self):
    """Iterate over the features in the dataset.

    A dataset whose features have not been loaded (``_features is None``)
    yields nothing instead of raising ``TypeError``, which keeps iteration
    consistent with ``__len__`` reporting 0 for that state.
    """
    if self._features is None:
        return iter(())
    return iter(self._features)

__len__()

Return the number of entries in the dataset.

Source code in src/polymetrix/datasets/dataset.py
def __len__(self):
    """Number of entries, taken from the feature array; 0 when none is set."""
    if self._features is None:
        return 0
    return len(self._features)

get_features(idx, feature_names=None)

Get features for specified indices. Args: idx (Collection[int]): Indices of entries. feature_names (Optional[Collection[str]]): Names of features to return. If None, returns all available features. Returns: np.ndarray: Array of feature values.

Source code in src/polymetrix/datasets/dataset.py
def get_features(
    self, idx: Collection[int], feature_names: Optional[Collection[str]] = None
) -> np.ndarray:
    """Get features for specified indices.

    Args:
        idx (Collection[int]): Indices of entries. May be empty, in which
            case an array with zero rows is returned.
        feature_names (Optional[Collection[str]]): Names of features to return.
            If None, returns all available features.

    Returns:
        np.ndarray: Array of feature values.

    Raises:
        ValueError: If a requested name is not in the dataset's feature names.
    """
    rows = np.asarray(idx)
    if rows.size == 0:
        # An empty array defaults to float64, which NumPy rejects as an index.
        rows = rows.astype(int)
    if feature_names is None:
        return self._features[rows]
    col_indices = [self._feature_names.index(name) for name in feature_names]
    return self._features[rows][:, col_indices]

get_labels(idx, label_names=None)

Get labels for specified indices. Args: idx (Collection[int]): Indices of entries. label_names (Optional[Collection[str]]): Names of labels to return. If None, returns all available labels. Returns: np.ndarray: Array of label values.

Source code in src/polymetrix/datasets/dataset.py
def get_labels(
    self, idx: Collection[int], label_names: Optional[Collection[str]] = None
) -> np.ndarray:
    """Get labels for specified indices.

    Args:
        idx (Collection[int]): Indices of entries. May be empty, in which
            case an array with zero rows is returned.
        label_names (Optional[Collection[str]]): Names of labels to return.
            If None, returns all available labels.

    Returns:
        np.ndarray: Array of label values.

    Raises:
        ValueError: If a requested name is not in the dataset's label names.
    """
    rows = np.asarray(idx)
    if rows.size == 0:
        # An empty array defaults to float64, which NumPy rejects as an index.
        rows = rows.astype(int)
    if label_names is None:
        return self._labels[rows]
    col_indices = [self._label_names.index(name) for name in label_names]
    return self._labels[rows][:, col_indices]

get_meta(idx, meta_keys=None)

Get metadata for specified indices. Args: idx (Collection[int]): Indices of entries. meta_keys (Optional[Collection[str]]): Names of metadata fields to return. If None, returns all available metadata.

Returns:

| Type | Description |
| --- | --- |
| ndarray | np.ndarray: Array of metadata values. |

Source code in src/polymetrix/datasets/dataset.py
def get_meta(
    self, idx: Collection[int], meta_keys: Optional[Collection[str]] = None
) -> np.ndarray:
    """Get metadata for specified indices.

    Args:
        idx (Collection[int]): Indices of entries. May be empty, in which
            case an array with zero rows is returned.
        meta_keys (Optional[Collection[str]]): Names of metadata fields to return.
            If None, returns all available metadata.

    Returns:
        np.ndarray: Array of metadata values.

    Raises:
        ValueError: If a requested key is not in the dataset's metadata names.
    """
    rows = np.asarray(idx)
    if rows.size == 0:
        # An empty array defaults to float64, which NumPy rejects as an index.
        rows = rows.astype(int)
    if meta_keys is None:
        return self._meta_data[rows]
    col_indices = [self._meta_names.index(name) for name in meta_keys]
    return self._meta_data[rows][:, col_indices]

get_subset(indices)

Get a subset of the dataset.

Source code in src/polymetrix/datasets/dataset.py
def get_subset(self, indices: Collection[int]) -> "AbstractDataset":
    """Get a subset of the dataset.

    Args:
        indices (Collection[int]): Row positions to keep; each must lie in
            ``[0, len(self))``.

    Returns:
        AbstractDataset: New instance of the same class holding only the
        selected rows and copies of the name lists.

    Raises:
        IndexError: If any index is negative or not smaller than ``len(self)``.
    """
    if not all(0 <= i < len(self) for i in indices):
        raise IndexError("Indices out of bounds.")

    # Convert once: a raw tuple would be read by NumPy as a multi-dimensional
    # index, and an empty collection must become an integer index array.
    rows = np.asarray(indices)
    if rows.size == 0:
        rows = rows.astype(int)

    def take(values):
        # Arrays that were never loaded stay None in the subset.
        return None if values is None else values[rows]

    subset = self.__class__()
    subset._features = take(self._features)
    subset._labels = take(self._labels)
    subset._meta_data = take(self._meta_data)
    subset._psmiles = take(self._psmiles)
    subset._feature_names = self._feature_names.copy()
    subset._label_names = self._label_names.copy()
    subset._meta_names = self._meta_names.copy()
    return subset

CuratedGlassTempDataset

Bases: AbstractDataset

Dataset for polymer glass transition temperature (Tg) data.

Source code in src/polymetrix/datasets/curated_tg_dataset.py
class CuratedGlassTempDataset(AbstractDataset):
    """Dataset for polymer glass transition temperature (Tg) data.

    The CSV file is obtained through ``POLYMETRIX_PYSTOW_MODULE.ensure`` and
    its columns are split by name prefix: ``"<level>.features."`` columns
    become features, ``"labels."`` columns become labels and ``"meta."``
    columns become metadata.
    """

    ALL_FEATURE_LEVELS = [
        "sidechainlevel",
        "backbonelevel",
        "fullpolymerlevel",
    ]
    FEATURE_PREFIX = "features."
    LABEL_PREFIX = "labels."
    META_PREFIX = "meta."

    DEFAULT_VERSION = "v1"
    DEFAULT_URL = "https://zenodo.org/records/15210035/files/LAMALAB_CURATED_Tg_structured_polymerclass.csv?download=1"

    def __init__(
        self,
        feature_levels: List[str] = ALL_FEATURE_LEVELS,
        subset: Optional[Collection[int]] = None,
    ):
        """Initialize the Tg dataset.

        Args:
            feature_levels (List[str]): Feature levels to include
            subset (Optional[Collection[int]]): Indices to include in the dataset

        Raises:
            ValueError: If ``feature_levels`` contains a level that is not in
                ``ALL_FEATURE_LEVELS``.
        """
        super().__init__()
        self._version = self.DEFAULT_VERSION
        self._url = self.DEFAULT_URL

        # Validate feature levels using set operations
        if not set(feature_levels).issubset(self.ALL_FEATURE_LEVELS):
            raise ValueError(
                f"feature_levels must be a subset of {self.ALL_FEATURE_LEVELS}, "
                f"got {feature_levels}"
            )

        # Store a copy: the default argument is the class-level list itself, so
        # keeping a reference would let callers mutate ALL_FEATURE_LEVELS
        # through ``active_feature_levels``.
        self._feature_levels = list(feature_levels)

        self._load_data(subset)

    def _load_data(self, subset: Optional[Collection[int]] = None):
        """Load and prepare the dataset.

        Args:
            subset (Optional[Collection[int]]): Row positions of the CSV file
                to keep. If None, all rows are used.
        """
        csv_path = POLYMETRIX_PYSTOW_MODULE.ensure(
            "CuratedGlassTempDataset",
            self._version,
            url=self._url,
        )
        frame = pd.read_csv(str(csv_path)).reset_index(drop=True)

        if subset is not None:
            frame = frame.iloc[subset]

        # Before the index is reset it still holds each row's position in the
        # full CSV; remember it so get_subset can translate local indices.
        self._source_rows = frame.index.to_numpy()
        self._df = frame.reset_index(drop=True)

        self._psmiles = self._df["PSMILES"].to_numpy()

        allowed_prefixes = [
            f"{level}.{self.FEATURE_PREFIX}" for level in self._feature_levels
        ]
        self._feature_names = self._filter_columns(allowed_prefixes)

        self._label_names = self._filter_columns([self.LABEL_PREFIX])
        self._meta_names = self._filter_columns([self.META_PREFIX])

        self._features = self._df[self._feature_names].to_numpy()
        self._labels = self._df[self._label_names].to_numpy()
        self._meta_data = self._df[self._meta_names].to_numpy()

    def _filter_columns(self, prefixes: List[str]) -> List[str]:
        """Return the DataFrame columns whose name starts with any of ``prefixes``."""
        return [
            col
            for col in self._df.columns
            if any(col.startswith(prefix) for prefix in prefixes)
        ]

    @property
    def df(self) -> pd.DataFrame:
        """The loaded (and possibly subset) data as a DataFrame."""
        return self._df

    @property
    def active_feature_levels(self) -> List[str]:
        """Feature levels this dataset was created with."""
        return self._feature_levels

    def get_subset(self, indices: Collection[int]) -> "CuratedGlassTempDataset":
        """Get a subset of this dataset.

        ``indices`` refer to rows of *this* dataset. They are translated to
        positions in the full CSV before the new dataset is built, so taking a
        subset of a subset selects the expected rows.

        Args:
            indices (Collection[int]): Row positions within this dataset.

        Returns:
            CuratedGlassTempDataset: New dataset with the selected rows and the
            same feature levels.

        Raises:
            IndexError: If an index is out of range for this dataset.
        """
        source_rows = self._source_rows[list(indices)].tolist()
        return CuratedGlassTempDataset(
            feature_levels=self._feature_levels,
            subset=source_rows,
        )

__init__(feature_levels=ALL_FEATURE_LEVELS, subset=None)

Initialize the Tg dataset. Args: feature_levels (List[str]): Feature levels to include subset (Optional[Collection[int]]): Indices to include in the dataset

Source code in src/polymetrix/datasets/curated_tg_dataset.py
def __init__(
    self,
    feature_levels: List[str] = ALL_FEATURE_LEVELS,
    subset: Optional[Collection[int]] = None,
):
    """Initialize the Tg dataset.

    Args:
        feature_levels (List[str]): Feature levels to include
        subset (Optional[Collection[int]]): Indices to include in the dataset

    Raises:
        ValueError: If ``feature_levels`` contains a level that is not in
            ``ALL_FEATURE_LEVELS``.
    """
    super().__init__()
    self._version = self.DEFAULT_VERSION
    self._url = self.DEFAULT_URL

    # Validate feature levels using set operations
    if not set(feature_levels).issubset(self.ALL_FEATURE_LEVELS):
        raise ValueError(
            f"feature_levels must be a subset of {self.ALL_FEATURE_LEVELS}, "
            f"got {feature_levels}"
        )

    # Store a copy: the default argument is the class-level list itself, so
    # keeping a reference would let later mutations of the instance's levels
    # leak into ALL_FEATURE_LEVELS.
    self._feature_levels = list(feature_levels)

    self._load_data(subset)

curated_tg_dataset

CuratedGlassTempDataset

Bases: AbstractDataset

Dataset for polymer glass transition temperature (Tg) data.

Source code in src/polymetrix/datasets/curated_tg_dataset.py
class CuratedGlassTempDataset(AbstractDataset):
    """Dataset for polymer glass transition temperature (Tg) data.

    The CSV file is obtained through ``POLYMETRIX_PYSTOW_MODULE.ensure`` and
    its columns are split by name prefix: ``"<level>.features."`` columns
    become features, ``"labels."`` columns become labels and ``"meta."``
    columns become metadata.
    """

    ALL_FEATURE_LEVELS = [
        "sidechainlevel",
        "backbonelevel",
        "fullpolymerlevel",
    ]
    FEATURE_PREFIX = "features."
    LABEL_PREFIX = "labels."
    META_PREFIX = "meta."

    DEFAULT_VERSION = "v1"
    DEFAULT_URL = "https://zenodo.org/records/15210035/files/LAMALAB_CURATED_Tg_structured_polymerclass.csv?download=1"

    def __init__(
        self,
        feature_levels: List[str] = ALL_FEATURE_LEVELS,
        subset: Optional[Collection[int]] = None,
    ):
        """Initialize the Tg dataset.

        Args:
            feature_levels (List[str]): Feature levels to include
            subset (Optional[Collection[int]]): Indices to include in the dataset

        Raises:
            ValueError: If ``feature_levels`` contains a level that is not in
                ``ALL_FEATURE_LEVELS``.
        """
        super().__init__()
        self._version = self.DEFAULT_VERSION
        self._url = self.DEFAULT_URL

        # Validate feature levels using set operations
        if not set(feature_levels).issubset(self.ALL_FEATURE_LEVELS):
            raise ValueError(
                f"feature_levels must be a subset of {self.ALL_FEATURE_LEVELS}, "
                f"got {feature_levels}"
            )

        # Store a copy: the default argument is the class-level list itself, so
        # keeping a reference would let callers mutate ALL_FEATURE_LEVELS
        # through ``active_feature_levels``.
        self._feature_levels = list(feature_levels)

        self._load_data(subset)

    def _load_data(self, subset: Optional[Collection[int]] = None):
        """Load and prepare the dataset.

        Args:
            subset (Optional[Collection[int]]): Row positions of the CSV file
                to keep. If None, all rows are used.
        """
        csv_path = POLYMETRIX_PYSTOW_MODULE.ensure(
            "CuratedGlassTempDataset",
            self._version,
            url=self._url,
        )
        frame = pd.read_csv(str(csv_path)).reset_index(drop=True)

        if subset is not None:
            frame = frame.iloc[subset]

        # Before the index is reset it still holds each row's position in the
        # full CSV; remember it so get_subset can translate local indices.
        self._source_rows = frame.index.to_numpy()
        self._df = frame.reset_index(drop=True)

        self._psmiles = self._df["PSMILES"].to_numpy()

        allowed_prefixes = [
            f"{level}.{self.FEATURE_PREFIX}" for level in self._feature_levels
        ]
        self._feature_names = self._filter_columns(allowed_prefixes)

        self._label_names = self._filter_columns([self.LABEL_PREFIX])
        self._meta_names = self._filter_columns([self.META_PREFIX])

        self._features = self._df[self._feature_names].to_numpy()
        self._labels = self._df[self._label_names].to_numpy()
        self._meta_data = self._df[self._meta_names].to_numpy()

    def _filter_columns(self, prefixes: List[str]) -> List[str]:
        """Return the DataFrame columns whose name starts with any of ``prefixes``."""
        return [
            col
            for col in self._df.columns
            if any(col.startswith(prefix) for prefix in prefixes)
        ]

    @property
    def df(self) -> pd.DataFrame:
        """The loaded (and possibly subset) data as a DataFrame."""
        return self._df

    @property
    def active_feature_levels(self) -> List[str]:
        """Feature levels this dataset was created with."""
        return self._feature_levels

    def get_subset(self, indices: Collection[int]) -> "CuratedGlassTempDataset":
        """Get a subset of this dataset.

        ``indices`` refer to rows of *this* dataset. They are translated to
        positions in the full CSV before the new dataset is built, so taking a
        subset of a subset selects the expected rows.

        Args:
            indices (Collection[int]): Row positions within this dataset.

        Returns:
            CuratedGlassTempDataset: New dataset with the selected rows and the
            same feature levels.

        Raises:
            IndexError: If an index is out of range for this dataset.
        """
        source_rows = self._source_rows[list(indices)].tolist()
        return CuratedGlassTempDataset(
            feature_levels=self._feature_levels,
            subset=source_rows,
        )
__init__(feature_levels=ALL_FEATURE_LEVELS, subset=None)

Initialize the Tg dataset. Args: feature_levels (List[str]): Feature levels to include subset (Optional[Collection[int]]): Indices to include in the dataset

Source code in src/polymetrix/datasets/curated_tg_dataset.py
def __init__(
    self,
    feature_levels: List[str] = ALL_FEATURE_LEVELS,
    subset: Optional[Collection[int]] = None,
):
    """Initialize the Tg dataset.

    Args:
        feature_levels (List[str]): Feature levels to include
        subset (Optional[Collection[int]]): Indices to include in the dataset

    Raises:
        ValueError: If ``feature_levels`` contains a level that is not in
            ``ALL_FEATURE_LEVELS``.
    """
    super().__init__()
    self._version = self.DEFAULT_VERSION
    self._url = self.DEFAULT_URL

    # Validate feature levels using set operations
    if not set(feature_levels).issubset(self.ALL_FEATURE_LEVELS):
        raise ValueError(
            f"feature_levels must be a subset of {self.ALL_FEATURE_LEVELS}, "
            f"got {feature_levels}"
        )

    # Store a copy: the default argument is the class-level list itself, so
    # keeping a reference would let later mutations of the instance's levels
    # leak into ALL_FEATURE_LEVELS.
    self._feature_levels = list(feature_levels)

    self._load_data(subset)

dataset

AbstractDataset

Bases: ABC

Base class for polymer datasets.

Source code in src/polymetrix/datasets/dataset.py
class AbstractDataset(ABC):
    """Base class for polymer datasets.

    Concrete datasets implement ``_load_data`` and fill the private arrays
    ``_features``, ``_labels``, ``_meta_data`` and ``_psmiles`` together with
    the matching ``_feature_names``, ``_label_names`` and ``_meta_names``.
    """

    def __init__(self):
        """Initialize an empty dataset (arrays unset, name lists empty)."""
        self._meta_data = None
        self._features = None
        self._labels = None
        self._psmiles = None
        self._feature_names = []
        self._label_names = []
        self._meta_names = []

    @abstractmethod
    def _load_data(self, subset: Optional[Collection[int]] = None):
        """Load and prepare the dataset-specific data.

        Args:
            subset (Optional[Collection[int]]): Indices to include in the dataset.
        """
        pass

    @staticmethod
    def _as_index(idx: Collection[int]) -> np.ndarray:
        """Convert a collection of row indices into a NumPy index array.

        Converting explicitly makes tuples behave like lists (a raw tuple
        would be read by NumPy as a multi-dimensional index) and makes an
        empty collection usable as an index.
        """
        index = np.asarray(idx)
        if index.size == 0:
            # An empty array defaults to float64, which NumPy rejects as an index.
            index = index.astype(int)
        return index

    def _select(self, values, names, idx, wanted) -> np.ndarray:
        """Return rows ``idx`` of ``values``, optionally limited to named columns."""
        if wanted is None:
            return values[self._as_index(idx)]
        # Resolve names first so an unknown name raises ValueError before any
        # row indexing is attempted.
        columns = [names.index(name) for name in wanted]
        return values[self._as_index(idx)][:, columns]

    def get_subset(self, indices: Collection[int]) -> "AbstractDataset":
        """Get a subset of the dataset.

        Args:
            indices (Collection[int]): Row positions to keep; each must lie
                in ``[0, len(self))``.

        Returns:
            AbstractDataset: New instance of the same class holding only the
            selected rows and copies of the name lists.

        Raises:
            IndexError: If any index is negative or not smaller than
                ``len(self)``.
        """
        if not all(0 <= i < len(self) for i in indices):
            raise IndexError("Indices out of bounds.")
        index = self._as_index(indices)

        def take(values):
            # Arrays that were never loaded stay None in the subset.
            return None if values is None else values[index]

        subset = self.__class__()
        subset._features = take(self._features)
        subset._labels = take(self._labels)
        subset._meta_data = take(self._meta_data)
        subset._psmiles = take(self._psmiles)
        subset._feature_names = self._feature_names.copy()
        subset._label_names = self._label_names.copy()
        subset._meta_names = self._meta_names.copy()
        return subset

    @property
    def available_features(self) -> list[str]:
        """List of available features.

        Returns:
            list[str]: List of feature names
        """
        return self._feature_names

    @property
    def available_labels(self) -> list[str]:
        """List of available labels.

        Returns:
            list[str]: List of label names
        """
        return self._label_names

    @property
    def meta_info(self) -> list[str]:
        """List of available metadata fields.

        Returns:
            list[str]: List of metadata field names
        """
        return self._meta_names

    @property
    def psmiles(self) -> np.ndarray:
        """Return the polymer SMILES strings.

        Returns:
            np.ndarray: Array of polymer SMILES strings
        """
        return self._psmiles

    def __len__(self):
        """Return the number of entries in the dataset (0 if nothing is loaded)."""
        return len(self._features) if self._features is not None else 0

    def __iter__(self):
        """Iterate over the features in the dataset.

        An unloaded dataset yields nothing, matching ``len(self) == 0``.
        """
        return iter(self._features if self._features is not None else ())

    def get_features(
        self, idx: Collection[int], feature_names: Optional[Collection[str]] = None
    ) -> np.ndarray:
        """Get features for specified indices.

        Args:
            idx (Collection[int]): Indices of entries.
            feature_names (Optional[Collection[str]]): Names of features to return.
                If None, returns all available features.

        Returns:
            np.ndarray: Array of feature values.

        Raises:
            ValueError: If a requested name is not in ``available_features``.
        """
        return self._select(self._features, self._feature_names, idx, feature_names)

    def get_labels(
        self, idx: Collection[int], label_names: Optional[Collection[str]] = None
    ) -> np.ndarray:
        """Get labels for specified indices.

        Args:
            idx (Collection[int]): Indices of entries.
            label_names (Optional[Collection[str]]): Names of labels to return.
                If None, returns all available labels.

        Returns:
            np.ndarray: Array of label values.

        Raises:
            ValueError: If a requested name is not in ``available_labels``.
        """
        return self._select(self._labels, self._label_names, idx, label_names)

    def get_meta(
        self, idx: Collection[int], meta_keys: Optional[Collection[str]] = None
    ) -> np.ndarray:
        """Get metadata for specified indices.

        Args:
            idx (Collection[int]): Indices of entries.
            meta_keys (Optional[Collection[str]]): Names of metadata fields to return.
                If None, returns all available metadata.

        Returns:
            np.ndarray: Array of metadata values.

        Raises:
            ValueError: If a requested key is not in ``meta_info``.
        """
        return self._select(self._meta_data, self._meta_names, idx, meta_keys)
available_features property

List of available features. Returns: list[str]: List of feature names

available_labels property

List of available labels. Returns: list[str]: List of label names

meta_info property

List of available metadata fields. Returns: list[str]: List of metadata field names

psmiles property

Return the polymer SMILES strings. Returns: np.ndarray: Array of polymer SMILES strings

__init__()

Initialize a dataset.

Source code in src/polymetrix/datasets/dataset.py
def __init__(self):
    """Set up an empty dataset: no arrays yet and no feature/label/meta names."""
    # The four data containers all start out unset.
    self._meta_data = self._features = self._labels = self._psmiles = None
    # Each name list is its own fresh list object.
    self._feature_names, self._label_names, self._meta_names = [], [], []
__iter__()

Iterate over the features in the dataset.

Source code in src/polymetrix/datasets/dataset.py
def __iter__(self):
    """Iterate over the features in the dataset.

    A dataset whose features have not been loaded (``_features is None``)
    yields nothing instead of raising ``TypeError``, which keeps iteration
    consistent with ``__len__`` reporting 0 for that state.
    """
    if self._features is None:
        return iter(())
    return iter(self._features)
__len__()

Return the number of entries in the dataset.

Source code in src/polymetrix/datasets/dataset.py
def __len__(self):
    """Number of entries, taken from the feature array; 0 when none is set."""
    if self._features is None:
        return 0
    return len(self._features)
get_features(idx, feature_names=None)

Get features for specified indices. Args: idx (Collection[int]): Indices of entries. feature_names (Optional[Collection[str]]): Names of features to return. If None, returns all available features. Returns: np.ndarray: Array of feature values.

Source code in src/polymetrix/datasets/dataset.py
def get_features(
    self, idx: Collection[int], feature_names: Optional[Collection[str]] = None
) -> np.ndarray:
    """Get features for specified indices.

    Args:
        idx (Collection[int]): Indices of entries. May be empty, in which
            case an array with zero rows is returned.
        feature_names (Optional[Collection[str]]): Names of features to return.
            If None, returns all available features.

    Returns:
        np.ndarray: Array of feature values.

    Raises:
        ValueError: If a requested name is not in the dataset's feature names.
    """
    rows = np.asarray(idx)
    if rows.size == 0:
        # An empty array defaults to float64, which NumPy rejects as an index.
        rows = rows.astype(int)
    if feature_names is None:
        return self._features[rows]
    col_indices = [self._feature_names.index(name) for name in feature_names]
    return self._features[rows][:, col_indices]
get_labels(idx, label_names=None)

Get labels for specified indices. Args: idx (Collection[int]): Indices of entries. label_names (Optional[Collection[str]]): Names of labels to return. If None, returns all available labels. Returns: np.ndarray: Array of label values.

Source code in src/polymetrix/datasets/dataset.py
def get_labels(
    self, idx: Collection[int], label_names: Optional[Collection[str]] = None
) -> np.ndarray:
    """Get labels for specified indices.

    Args:
        idx (Collection[int]): Indices of entries. May be empty, in which
            case an array with zero rows is returned.
        label_names (Optional[Collection[str]]): Names of labels to return.
            If None, returns all available labels.

    Returns:
        np.ndarray: Array of label values.

    Raises:
        ValueError: If a requested name is not in the dataset's label names.
    """
    rows = np.asarray(idx)
    if rows.size == 0:
        # An empty array defaults to float64, which NumPy rejects as an index.
        rows = rows.astype(int)
    if label_names is None:
        return self._labels[rows]
    col_indices = [self._label_names.index(name) for name in label_names]
    return self._labels[rows][:, col_indices]
get_meta(idx, meta_keys=None)

Get metadata for specified indices. Args: idx (Collection[int]): Indices of entries. meta_keys (Optional[Collection[str]]): Names of metadata fields to return. If None, returns all available metadata.

Returns:

| Type | Description |
| --- | --- |
| ndarray | np.ndarray: Array of metadata values. |

Source code in src/polymetrix/datasets/dataset.py
def get_meta(
    self, idx: Collection[int], meta_keys: Optional[Collection[str]] = None
) -> np.ndarray:
    """Get metadata for specified indices.

    Args:
        idx (Collection[int]): Indices of entries. May be empty, in which
            case an array with zero rows is returned.
        meta_keys (Optional[Collection[str]]): Names of metadata fields to return.
            If None, returns all available metadata.

    Returns:
        np.ndarray: Array of metadata values.

    Raises:
        ValueError: If a requested key is not in the dataset's metadata names.
    """
    rows = np.asarray(idx)
    if rows.size == 0:
        # An empty array defaults to float64, which NumPy rejects as an index.
        rows = rows.astype(int)
    if meta_keys is None:
        return self._meta_data[rows]
    col_indices = [self._meta_names.index(name) for name in meta_keys]
    return self._meta_data[rows][:, col_indices]
get_subset(indices)

Get a subset of the dataset.

Source code in src/polymetrix/datasets/dataset.py
def get_subset(self, indices: Collection[int]) -> "AbstractDataset":
    """Get a subset of the dataset.

    Args:
        indices (Collection[int]): Row positions to keep; each must lie in
            ``[0, len(self))``.

    Returns:
        AbstractDataset: New instance of the same class holding only the
        selected rows and copies of the name lists.

    Raises:
        IndexError: If any index is negative or not smaller than ``len(self)``.
    """
    if not all(0 <= i < len(self) for i in indices):
        raise IndexError("Indices out of bounds.")

    # Convert once: a raw tuple would be read by NumPy as a multi-dimensional
    # index, and an empty collection must become an integer index array.
    rows = np.asarray(indices)
    if rows.size == 0:
        rows = rows.astype(int)

    def take(values):
        # Arrays that were never loaded stay None in the subset.
        return None if values is None else values[rows]

    subset = self.__class__()
    subset._features = take(self._features)
    subset._labels = take(self._labels)
    subset._meta_data = take(self._meta_data)
    subset._psmiles = take(self._psmiles)
    subset._feature_names = self._feature_names.copy()
    subset._label_names = self._label_names.copy()
    subset._meta_names = self._meta_names.copy()
    return subset

featurizers

base_featurizer

BaseFeatureCalculator

Source code in src/polymetrix/featurizers/base_featurizer.py
class BaseFeatureCalculator:
    """Base class for feature calculators with pluggable aggregation.

    Subclasses provide ``calculate`` and ``feature_base_labels``; the names in
    ``self.agg`` select which reducers from ``agg_funcs`` are used by
    ``aggregate`` and appended to the labels by ``feature_labels``.
    """

    # Aggregation name -> NumPy reducer.
    agg_funcs = {
        "mean": np.mean,
        "min": np.min,
        "max": np.max,
        "sum": np.sum,
    }

    def __init__(self, agg: List[str] = None):
        """Store the aggregation names, falling back to ``["sum"]`` when omitted."""
        self.agg = ["sum"] if agg is None else agg

    def _sanitize(self, mol: Chem.Mol, sanitize: bool) -> None:
        """Handle molecule sanitization with kekulization exception handling."""
        if not sanitize:
            return
        # Run every sanitization step except kekulization.
        ops = Chem.SANITIZE_ALL ^ Chem.SANITIZE_KEKULIZE
        try:
            Chem.SanitizeMol(mol, sanitizeOps=ops)
        except Chem.AtomKekulizeException:
            mol.UpdatePropertyCache()

    def calculate(self, mol: Chem.Mol) -> np.ndarray:
        """Compute the features for ``mol``; must be overridden."""
        raise NotImplementedError("Calculate method must be implemented by subclasses")

    def feature_base_labels(self) -> List[str]:
        """Return the un-aggregated feature labels; must be overridden."""
        raise NotImplementedError(
            "Feature labels method must be implemented by subclasses"
        )

    def feature_labels(self) -> List[str]:
        """Return one ``"<label>_<agg>"`` entry per base label and aggregation."""
        labels = []
        for base in self.feature_base_labels():
            for agg_name in self.agg:
                labels.append(f"{base}_{agg_name}")
        return labels

    def aggregate(self, features: List) -> np.ndarray:
        """
        Reduce a list of features with every aggregation named in self.agg.

        When the first element is a numpy array, each reducer is applied along
        the first axis and the per-reducer results are concatenated. Otherwise
        the reducers are applied to the list directly (assuming scalar numeric
        values) and the results are collected into one array. An empty list
        yields an empty array.
        """
        if not features:
            return np.array([])

        # The first element alone decides which branch is taken.
        array_valued = isinstance(features[0], np.ndarray)
        collected = []
        for name in self.agg:
            if name not in self.agg_funcs:
                raise ValueError(f"Unknown aggregation function: {name}")
            reducer = self.agg_funcs[name]
            if array_valued:
                collected.append(reducer(features, axis=0))
            else:
                collected.append(reducer(features))

        if array_valued:
            return np.concatenate(collected)
        return np.array(collected)

    def get_feature_names(self) -> List[str]:
        """Return the feature names; must be overridden."""
        raise NotImplementedError(
            "Get feature name method must be implemented by subclasses"
        )

    def citations(self) -> List[str]:
        """Return citations for this calculator (none by default)."""
        return []

    def implementors(self) -> List[str]:
        """Return the implementors of this calculator (none by default)."""
        return []
aggregate(features)

Aggregates a list of features using the aggregation functions specified in self.agg. If the features are numpy arrays, the aggregation is applied along the first axis. Otherwise, the aggregation is applied directly (assuming the features are scalar numeric values).

Source code in src/polymetrix/featurizers/base_featurizer.py
def aggregate(self, features: List) -> np.ndarray:
    """
    Reduce a list of features with every aggregation named in self.agg.

    When the first element is a numpy array, each reducer is applied along
    the first axis and the per-reducer results are concatenated. Otherwise
    the reducers are applied to the list directly (assuming scalar numeric
    values) and the results are collected into one array. An empty list
    yields an empty array.
    """
    if not features:
        return np.array([])

    # The first element alone decides which branch is taken.
    array_valued = isinstance(features[0], np.ndarray)
    collected = []
    for name in self.agg:
        if name not in self.agg_funcs:
            raise ValueError(f"Unknown aggregation function: {name}")
        reducer = self.agg_funcs[name]
        if array_valued:
            collected.append(reducer(features, axis=0))
        else:
            collected.append(reducer(features))

    if array_valued:
        return np.concatenate(collected)
    return np.array(collected)

MoleculeFeaturizer

Base class for featurizers that work with general molecules.

Source code in src/polymetrix/featurizers/base_featurizer.py
class MoleculeFeaturizer:
    """Base class for featurizers that work with general molecules."""

    def __init__(self, calculator: Optional[BaseFeatureCalculator] = None):
        """Keep a reference to the (optional) feature calculator."""
        self.calculator = calculator

    def featurize(self, molecule) -> np.ndarray:
        """Turn ``molecule`` into a feature array; must be overridden."""
        raise NotImplementedError("Featurize method must be implemented by subclasses")

    def feature_labels(self) -> List[str]:
        """Return the calculator's labels suffixed with the lower-cased class name.

        Without a calculator the lower-cased class name is the only label.
        """
        suffix = self.__class__.__name__.lower()
        if not self.calculator:
            return [suffix]
        labels = []
        for label in self.calculator.feature_labels():
            labels.append(f"{label}_{suffix}")
        return labels

chemical_featurizer

BalabanJIndex

Bases: GenericScalarFeaturizer

Measures molecular complexity and connectivity of atoms.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class BalabanJIndex(GenericScalarFeaturizer):
    """Scalar featurizer for the Balaban J index.

    Wraps ``GraphDescriptors.BalabanJ``, a measure of molecular complexity and
    atom connectivity, and registers it under the label ``"balaban_j_index"``.
    """

    def __init__(self, agg: List[str] = None):
        """Forward the descriptor, its label and ``agg`` to the base featurizer."""
        descriptor = GraphDescriptors.BalabanJ
        super().__init__(descriptor, "balaban_j_index", agg=agg)

BondCounts

Bases: BaseFeatureCalculator

Counts the number of single, double, and triple bonds in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class BondCounts(BaseFeatureCalculator):
    """Tally the single, double and triple bonds of a molecule.

    Bonds of any other bond type are ignored.
    """

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Return ``[n_single, n_double, n_triple]`` for ``mol``.

        Args:
            mol: RDKit molecule whose bonds are inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.
        """
        self._sanitize(mol, sanitize)
        # Order of this tuple fixes the order of the output vector and must
        # stay in sync with ``feature_base_labels``.
        tracked_orders = (
            Chem.BondType.SINGLE,
            Chem.BondType.DOUBLE,
            Chem.BondType.TRIPLE,
        )
        observed = [bond.GetBondType() for bond in mol.GetBonds()]
        return np.array([observed.count(order) for order in tracked_orders])

    def feature_base_labels(self) -> List[str]:
        """Return the labels matching the three counts, in output order."""
        return ["single_bonds", "double_bonds", "triple_bonds"]

BridgingRingsCount

Bases: BaseFeatureCalculator

Counts the number of bridging rings in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class BridgingRingsCount(BaseFeatureCalculator):
    """Count rings that share at least two atoms with another ring.

    A ring contributes at most once: it is counted when it has two or more
    atoms in common with any ring that comes after it in the ring list.
    """

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Return a one-element array holding the bridging-ring count of ``mol``.

        Args:
            mol: RDKit molecule whose ring information is inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.
        """
        self._sanitize(mol, sanitize)
        ring_atom_sets = [set(ring) for ring in mol.GetRingInfo().AtomRings()]

        # Compare each ring only with the rings listed after it, so every
        # unordered pair is looked at once and a ring is counted once at most.
        bridging_total = sum(
            1
            for position, current in enumerate(ring_atom_sets)
            if any(
                len(current & later) >= 2
                for later in ring_atom_sets[position + 1 :]
            )
        )
        return np.array([bridging_total])

    def feature_base_labels(self) -> List[str]:
        """Return the single label of this feature."""
        return ["bridging_rings_count"]

FpDensityMorgan1

Bases: GenericScalarFeaturizer

Calculates the density of the Morgan1 fingerprint.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class FpDensityMorgan1(GenericScalarFeaturizer):
    """Scalar featurizer for the density of the Morgan1 fingerprint.

    Wraps ``Descriptors.FpDensityMorgan1`` under the label
    ``"fp_density_morgan1"``.
    """

    def __init__(self, agg: List[str] = None):
        """Forward the descriptor, its label and ``agg`` to the base featurizer."""
        descriptor = Descriptors.FpDensityMorgan1
        super().__init__(descriptor, "fp_density_morgan1", agg=agg)

FractionBicyclicRings

Bases: BaseFeatureCalculator

Calculates the fraction of bicyclic rings in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class FractionBicyclicRings(BaseFeatureCalculator):
    """Ratio of atom-sharing ring pairs to the number of rings.

    Every unordered pair of rings with at least one atom in common counts once;
    the total is divided by the number of rings (0 when there are no rings).
    """

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Return a one-element array with the bicyclic fraction of ``mol``.

        Args:
            mol: RDKit molecule whose ring information is inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.
        """
        self._sanitize(mol, sanitize)
        rings = mol.GetRingInfo().AtomRings()
        if not rings:
            # No rings at all: avoid dividing by zero.
            return np.array([0])

        ring_atom_sets = [set(ring) for ring in rings]
        overlapping_pairs = 0
        for position, first in enumerate(ring_atom_sets):
            for second in ring_atom_sets[position + 1 :]:
                if first & second:
                    overlapping_pairs += 1
        return np.array([overlapping_pairs / len(rings)])

    def feature_base_labels(self) -> List[str]:
        """Return the single label of this feature."""
        return ["fraction_bicyclic_rings"]

HalogenCounts

Bases: BaseFeatureCalculator

Counts the number of halogen atoms in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class HalogenCounts(BaseFeatureCalculator):
    """Count fluorine, chlorine, bromine and iodine atoms, plus their total."""

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Return ``[total, n_F, n_Cl, n_Br, n_I]`` for ``mol``.

        Args:
            mol: RDKit molecule whose atoms are inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.
        """
        self._sanitize(mol, sanitize)
        halogen_numbers = (9, 17, 35, 53)  # F, Cl, Br, I
        atomic_numbers = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
        per_element = [atomic_numbers.count(number) for number in halogen_numbers]
        # The total comes first, followed by the per-element counts.
        return np.array([sum(per_element), *per_element])

    def feature_base_labels(self) -> List[str]:
        """Return the labels matching the five counts, in output order."""
        labels = ["total_halogens"]
        labels += [
            "fluorine_count",
            "chlorine_count",
            "bromine_count",
            "iodine_count",
        ]
        return labels

HeteroatomCount

Bases: BaseFeatureCalculator

Counts heteroatoms (non-C, non-H) across the whole molecule, not only those in heterocyclic rings.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class HeteroatomCount(BaseFeatureCalculator):
    """
    Counts heteroatoms (non-C, non-H) in the molecule.

    The count covers every atom of the molecule, not only ring atoms. Any atom
    whose atomic number is neither 1 nor 6 is counted.
    NOTE(review): this includes atomic number 0, which is presumably what
    polymer connection points (``*``) report - confirm whether those should
    be counted.
    """

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Return a one-element array with the heteroatom count of ``mol``.

        Args:
            mol: RDKit molecule whose atoms are inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.
        """
        self._sanitize(mol, sanitize)
        # Explicit hydrogens (atomic number 1) are not heteroatoms, so they are
        # excluded together with carbon.
        heteroatom_total = sum(
            1 for atom in mol.GetAtoms() if atom.GetAtomicNum() not in (1, 6)
        )
        return np.array([heteroatom_total])

    def feature_base_labels(self) -> List[str]:
        """Return the single label of this feature."""
        return ["heteroatom_count"]

HeteroatomDensity

Bases: BaseFeatureCalculator

Density of heteroatoms in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class HeteroatomDensity(BaseFeatureCalculator):
    """
    Density of heteroatoms in the molecule.

    Computed as the number of atoms that are neither carbon nor hydrogen
    divided by ``mol.GetNumAtoms()``; 0 for a molecule without atoms.
    """

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Return a one-element array with the heteroatom density of ``mol``.

        Args:
            mol: RDKit molecule whose atoms are inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.
        """
        self._sanitize(mol, sanitize)
        num_atoms = mol.GetNumAtoms()
        if not num_atoms:
            # Empty molecule: avoid dividing by zero.
            return np.array([0])
        # Explicit hydrogens (atomic number 1) are not heteroatoms, so they are
        # excluded together with carbon.
        num_heteroatoms = sum(
            1 for atom in mol.GetAtoms() if atom.GetAtomicNum() not in (1, 6)
        )
        return np.array([num_heteroatoms / num_atoms])

    def feature_base_labels(self) -> List[str]:
        """Return the single label of this feature."""
        return ["heteroatom_density"]

MaxEStateIndex

Bases: GenericScalarFeaturizer

Maximum electrotopological state (E-state) index over the atoms of the molecule, reflecting each atom's electronic and topological environment.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class MaxEStateIndex(GenericScalarFeaturizer):
    """Scalar featurizer for the maximum E-state index.

    Wraps ``Descriptors.MaxEStateIndex`` under the label
    ``"max_estate_index"``.
    """

    def __init__(self, agg: List[str] = None):
        """Forward the descriptor, its label and ``agg`` to the base featurizer."""
        descriptor = Descriptors.MaxEStateIndex
        super().__init__(descriptor, "max_estate_index", agg=agg)

MaxRingSize

Bases: BaseFeatureCalculator

Calculates the size of the largest ring in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class MaxRingSize(BaseFeatureCalculator):
    """Size (atom count) of the largest ring in the molecule; 0 without rings."""

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Return a one-element array with the largest ring size of ``mol``.

        Args:
            mol: RDKit molecule whose ring information is inspected.
            sanitize: Forwarded to ``self._sanitize`` before measuring.
        """
        self._sanitize(mol, sanitize)
        ring_sizes = [len(ring) for ring in mol.GetRingInfo().AtomRings()]
        largest = max(ring_sizes) if ring_sizes else 0
        return np.array([largest])

    def feature_base_labels(self) -> List[str]:
        """Return the single label of this feature."""
        return ["max_ring_size"]

MolecularWeight

Bases: GenericScalarFeaturizer

Calculates the molecular weight of the molecule using RDKit's ExactMolWt descriptor (the exact mass rather than the average molecular weight).

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class MolecularWeight(GenericScalarFeaturizer):
    """Scalar featurizer for the molecular weight of the molecule.

    The value comes from ``Descriptors.ExactMolWt`` and is registered under
    the label ``"molecular_weight"``.
    """

    def __init__(self, agg: List[str] = None):
        """Forward the descriptor, its label and ``agg`` to the base featurizer."""
        descriptor = Descriptors.ExactMolWt
        super().__init__(descriptor, "molecular_weight", agg=agg)

NumAliphaticHeterocycles

Bases: BaseFeatureCalculator

Counts the number of aliphatic heterocycles in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class NumAliphaticHeterocycles(BaseFeatureCalculator):
    """
    Counts the number of aliphatic heterocycles in the molecule.

    A ring is counted when it contains at least one non-carbon atom and is not
    fully aromatic (i.e. at least one of its atoms is not flagged aromatic).
    """

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Return a one-element array with the aliphatic heterocycle count.

        Args:
            mol: RDKit molecule whose ring information is inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.
        """
        self._sanitize(mol, sanitize)
        num_heterocycles = 0
        for ring in mol.GetRingInfo().AtomRings():
            ring_atoms = [mol.GetAtomWithIdx(idx) for idx in ring]
            has_heteroatom = any(atom.GetAtomicNum() != 6 for atom in ring_atoms)
            # Fully aromatic rings (e.g. pyridine) are aromatic heterocycles
            # and must not be reported as aliphatic ones.
            fully_aromatic = all(atom.GetIsAromatic() for atom in ring_atoms)
            if has_heteroatom and not fully_aromatic:
                num_heterocycles += 1
        return np.array([num_heterocycles])

    def feature_base_labels(self) -> List[str]:
        """Return the single label of this feature."""
        return ["num_aliphatic_heterocycles"]

NumAromaticRings

Bases: BaseFeatureCalculator

Counts the number of aromatic rings in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class NumAromaticRings(BaseFeatureCalculator):
    """Count the rings in which every atom is flagged aromatic."""

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Return a one-element array with the aromatic ring count of ``mol``.

        Args:
            mol: RDKit molecule whose ring information is inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.
        """
        self._sanitize(mol, sanitize)
        aromatic_total = 0
        for ring in mol.GetRingInfo().AtomRings():
            # A ring qualifies only when all of its atoms are aromatic.
            if all(mol.GetAtomWithIdx(idx).GetIsAromatic() for idx in ring):
                aromatic_total += 1
        return np.array([aromatic_total])

    def feature_base_labels(self) -> List[str]:
        """Return the single label of this feature."""
        return ["num_aromatic_rings"]

NumAtoms

Bases: BaseFeatureCalculator

Counts the number of atoms in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class NumAtoms(BaseFeatureCalculator):
    """Number of atoms reported by ``mol.GetNumAtoms()``."""

    def calculate(self, mol: Chem.Mol, sanitize: bool = False) -> np.ndarray:
        """Return a one-element array with the atom count of ``mol``.

        Args:
            mol: RDKit molecule to measure.
            sanitize: Accepted for interface compatibility; not used here.
        """
        atom_total = mol.GetNumAtoms()
        return np.array([atom_total])

    def feature_base_labels(self) -> List[str]:
        """Return the single label of this feature."""
        return ["num_atoms"]

NumHBondAcceptors

Bases: GenericScalarFeaturizer

Counts the number of hydrogen bond acceptors.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class NumHBondAcceptors(GenericScalarFeaturizer):
    """Scalar featurizer for the number of hydrogen bond acceptors.

    Wraps ``Descriptors.NumHAcceptors`` under the label
    ``"num_hbond_acceptors"``.
    """

    def __init__(self, agg: List[str] = None):
        """Forward the descriptor, its label and ``agg`` to the base featurizer."""
        descriptor = Descriptors.NumHAcceptors
        super().__init__(descriptor, "num_hbond_acceptors", agg=agg)

NumHBondDonors

Bases: GenericScalarFeaturizer

Counts the number of hydrogen bond donors.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class NumHBondDonors(GenericScalarFeaturizer):
    """Scalar featurizer for the number of hydrogen bond donors.

    Wraps ``Descriptors.NumHDonors`` under the label ``"num_hbond_donors"``.
    """

    def __init__(self, agg: List[str] = None):
        """Forward the descriptor, its label and ``agg`` to the base featurizer."""
        descriptor = Descriptors.NumHDonors
        super().__init__(descriptor, "num_hbond_donors", agg=agg)

NumNonAromaticRings

Bases: BaseFeatureCalculator

Counts the number of non-aromatic rings in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class NumNonAromaticRings(BaseFeatureCalculator):
    """Count the rings that contain at least one non-aromatic atom."""

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Return a one-element array with the non-aromatic ring count.

        Args:
            mol: RDKit molecule whose ring information is inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.
        """
        self._sanitize(mol, sanitize)
        non_aromatic_total = 0
        for ring in mol.GetRingInfo().AtomRings():
            # One non-aromatic atom is enough for the ring to count.
            if any(not mol.GetAtomWithIdx(idx).GetIsAromatic() for idx in ring):
                non_aromatic_total += 1
        return np.array([non_aromatic_total])

    def feature_base_labels(self) -> List[str]:
        """Return the single label of this feature."""
        return ["num_non_aromatic_rings"]

NumRings

Bases: BaseFeatureCalculator

Counts the number of rings in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class NumRings(BaseFeatureCalculator):
    """Number of rings listed in the molecule's ring information."""

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Return a one-element array with the ring count of ``mol``.

        Args:
            mol: RDKit molecule whose ring information is inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.
        """
        self._sanitize(mol, sanitize)
        rings = mol.GetRingInfo().AtomRings()
        return np.array([len(rings)])

    def feature_base_labels(self) -> List[str]:
        """Return the single label of this feature."""
        return ["num_rings"]

NumRotatableBonds

Bases: GenericScalarFeaturizer

Counts the number of rotatable bonds.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class NumRotatableBonds(GenericScalarFeaturizer):
    """Scalar featurizer for the number of rotatable bonds.

    Wraps ``Descriptors.NumRotatableBonds`` under the label
    ``"num_rotatable_bonds"``.
    """

    def __init__(self, agg: List[str] = None):
        """Forward the descriptor, its label and ``agg`` to the base featurizer."""
        descriptor = Descriptors.NumRotatableBonds
        super().__init__(descriptor, "num_rotatable_bonds", agg=agg)

SlogPVSA1

Bases: GenericScalarFeaturizer

Calculates the Surface area contributing to octanol solubility, linked to lipophilicity.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class SlogPVSA1(GenericScalarFeaturizer):
    """Scalar featurizer for the SlogP_VSA1 descriptor.

    Wraps ``Descriptors.SlogP_VSA1``, a surface-area descriptor linked to
    lipophilicity, under the label ``"slogp_vsa1"``.
    """

    def __init__(self, agg: List[str] = None):
        """Forward the descriptor, its label and ``agg`` to the base featurizer."""
        descriptor = Descriptors.SlogP_VSA1
        super().__init__(descriptor, "slogp_vsa1", agg=agg)

SmrVSA5

Bases: GenericScalarFeaturizer

Summed van der Waals surface area of the atoms whose molar refractivity contribution lies in the range 2.45–2.75 (RDKit's SMR_VSA5 descriptor).

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class SmrVSA5(GenericScalarFeaturizer):
    """Scalar featurizer for the SMR_VSA5 descriptor.

    Wraps ``Descriptors.SMR_VSA5``, a surface-area descriptor binned by molar
    refractivity (bin 2.45-2.75), under the label ``"smr_vsa5"``.
    """

    def __init__(self, agg: List[str] = None):
        """Forward the descriptor, its label and ``agg`` to the base featurizer."""
        descriptor = Descriptors.SMR_VSA5
        super().__init__(descriptor, "smr_vsa5", agg=agg)

Sp2CarbonCountFeaturizer

Bases: BaseFeatureCalculator

Counts the number of sp2 hybridized carbon atoms in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class Sp2CarbonCountFeaturizer(BaseFeatureCalculator):
    """
    Counts the number of sp2 hybridized carbon atoms in the molecule.

    Only atoms with atomic number 6 are considered; sp2 hybridized atoms of
    other elements (e.g. oxygen or nitrogen) are not counted.
    """

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Return a one-element array with the sp2 carbon count of ``mol``.

        Args:
            mol: RDKit molecule whose atoms are inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.
        """
        self._sanitize(mol, sanitize)
        sp2_count = sum(
            1
            for atom in mol.GetAtoms()
            # The element check keeps sp2 heteroatoms out of a carbon count.
            if atom.GetAtomicNum() == 6
            and atom.GetHybridization() == Chem.HybridizationType.SP2
        )
        return np.array([sp2_count])

    def feature_base_labels(self) -> List[str]:
        """Return the single label of this feature."""
        return ["sp2_carbon_count"]

Sp3CarbonCountFeaturizer

Bases: BaseFeatureCalculator

Counts the number of sp3 hybridized carbon atoms in the molecule.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class Sp3CarbonCountFeaturizer(BaseFeatureCalculator):
    """
    Counts the number of sp3 hybridized carbon atoms in the molecule.

    Only atoms with atomic number 6 are considered; sp3 hybridized atoms of
    other elements (e.g. oxygen or nitrogen) are not counted.
    """

    def calculate(self, mol: Chem.Mol, sanitize: bool = True) -> np.ndarray:
        """Return a one-element array with the sp3 carbon count of ``mol``.

        Args:
            mol: RDKit molecule whose atoms are inspected.
            sanitize: Forwarded to ``self._sanitize`` before counting.
        """
        self._sanitize(mol, sanitize)
        sp3_count = sum(
            1
            for atom in mol.GetAtoms()
            # The element check keeps sp3 heteroatoms out of a carbon count.
            if atom.GetAtomicNum() == 6
            and atom.GetHybridization() == Chem.HybridizationType.SP3
        )
        return np.array([sp3_count])

    def feature_base_labels(self) -> List[str]:
        """Return the single label of this feature."""
        return ["sp3_carbon_count"]

TopologicalSurfaceArea

Bases: GenericScalarFeaturizer

Calculates the topological polar surface area.

Source code in src/polymetrix/featurizers/chemical_featurizer.py
class TopologicalSurfaceArea(GenericScalarFeaturizer):
    """Scalar featurizer for the topological polar surface area.

    Wraps ``Descriptors.TPSA`` under the label ``"topological_surface_area"``.
    """

    def __init__(self, agg: List[str] = None):
        """Forward the descriptor, its label and ``agg`` to the base featurizer."""
        descriptor = Descriptors.TPSA
        super().__init__(descriptor, "topological_surface_area", agg=agg)

comparator

PolymerMoleculeComparator

Comparator that computes various comparison metrics between polymer and molecule features.

Source code in src/polymetrix/featurizers/comparator.py
class PolymerMoleculeComparator:
    """Compare the feature vector of a polymer with that of a molecule.

    Each selected comparison is applied element-wise to the two flattened
    feature vectors. Optionally, the comparison results are then aggregated
    across comparison methods and appended to the output.
    """

    # Element-wise comparison functions, keyed by the name used in ``comparisons``.
    # Note that "euclidean_distance" is evaluated per element as well.
    comparators: ClassVar = {
        "absolute_difference": lambda p, m: np.abs(p - m),
        "signed_difference": lambda p, m: p - m,
        "product": lambda p, m: p * m,
        "squared_distance": lambda p, m: (p - m) ** 2,
        "euclidean_distance": lambda p, m: np.sqrt((p - m) ** 2),
    }

    # Reductions applied across comparison results, keyed by the name used in ``agg``.
    agg_funcs: ClassVar = {
        "mean": np.mean,
        "min": np.min,
        "max": np.max,
        "sum": np.sum,
    }

    def __init__(
        self, polymer_featurizer, molecule_featurizer, comparisons=None, agg=None
    ):
        """Store the two featurizers and the requested comparison/aggregation names.

        Args:
            polymer_featurizer: Object providing ``featurize`` and ``feature_labels``.
            molecule_featurizer: Object providing ``featurize``.
            comparisons: Names from ``comparators``; defaults to
                ``["absolute_difference"]``.
            agg: Names from ``agg_funcs``; defaults to no aggregation.
        """
        self.polymer_featurizer = polymer_featurizer
        self.molecule_featurizer = molecule_featurizer
        if comparisons is None:
            comparisons = ["absolute_difference"]
        if agg is None:
            agg = []
        self.comparisons = comparisons
        self.agg = agg

    def _concatenate_results(self, results):
        """Join ``results`` into one array; an empty list yields an empty array."""
        if not results:
            return np.array([])
        return np.concatenate(results)

    def compare(self, polymer, molecule):
        """Return comparison metrics between polymer and molecule features.

        Raises:
            ValueError: If a name in ``self.comparisons`` (or, through
                ``aggregate``, in ``self.agg``) is not registered.
        """
        polymer_vector = self.polymer_featurizer.featurize(polymer).flatten()
        molecule_vector = self.molecule_featurizer.featurize(molecule).flatten()

        pieces = []
        for name in self.comparisons:
            if name not in self.comparators:
                raise ValueError(f"Unknown comparison function: {name}")
            metric = self.comparators[name]
            pieces.append(metric(polymer_vector, molecule_vector))

        if self.agg:
            # Aggregate over the comparison results only, then append the
            # aggregated block after them.
            aggregated_block = self.aggregate(pieces)
            pieces.append(aggregated_block)

        return self._concatenate_results(pieces)

    def aggregate(self, features):
        """Aggregate features across comparison methods (reduction along axis 0).

        Raises:
            ValueError: If a name in ``self.agg`` is not registered.
        """
        reduced = []
        for name in self.agg:
            if name not in self.agg_funcs:
                raise ValueError(f"Unknown aggregation function: {name}")
            reducer = self.agg_funcs[name]
            reduced.append(reducer(features, axis=0))

        return self._concatenate_results(reduced)

    def _generate_labels(self, base_labels, suffix):
        """Append ``_<suffix>`` to every label in ``base_labels``."""
        return [f"{label}_{suffix}" for label in base_labels]

    def feature_labels(self):
        """Generate labels for comparison and aggregated features."""
        base_labels = self.polymer_featurizer.feature_labels()

        # One suffix per comparison, followed by one per aggregation; the
        # order mirrors the layout of the vector returned by ``compare``.
        suffixes = list(self.comparisons)
        if self.agg:
            comparison_methods_str = "_".join(self.comparisons)
            suffixes.extend(
                f"{comparison_methods_str}_{agg_name}" for agg_name in self.agg
            )

        labels = []
        for suffix in suffixes:
            labels.extend(self._generate_labels(base_labels, suffix))
        return labels
aggregate(features)

Aggregate features across comparison methods.

Source code in src/polymetrix/featurizers/comparator.py
def aggregate(self, features):
    """Aggregate features across comparison methods (reduction along axis 0).

    Raises:
        ValueError: If a name in ``self.agg`` is not registered in
            ``self.agg_funcs``.
    """
    reduced = []
    for name in self.agg:
        if name not in self.agg_funcs:
            raise ValueError(f"Unknown aggregation function: {name}")
        reducer = self.agg_funcs[name]
        reduced.append(reducer(features, axis=0))

    return self._concatenate_results(reduced)
compare(polymer, molecule)

Return comparison metrics between polymer and molecule features.

Source code in src/polymetrix/featurizers/comparator.py
def compare(self, polymer, molecule):
    """Return comparison metrics between polymer and molecule features.

    Raises:
        ValueError: If a name in ``self.comparisons`` is not registered in
            ``self.comparators``.
    """
    polymer_vector = self.polymer_featurizer.featurize(polymer).flatten()
    molecule_vector = self.molecule_featurizer.featurize(molecule).flatten()

    pieces = []
    for name in self.comparisons:
        if name not in self.comparators:
            raise ValueError(f"Unknown comparison function: {name}")
        metric = self.comparators[name]
        pieces.append(metric(polymer_vector, molecule_vector))

    if self.agg:
        # Aggregate over the comparison results only, then append the
        # aggregated block after them.
        aggregated_block = self.aggregate(pieces)
        pieces.append(aggregated_block)

    return self._concatenate_results(pieces)
feature_labels()

Generate labels for comparison and aggregated features.

Source code in src/polymetrix/featurizers/comparator.py
def feature_labels(self):
    """Generate labels for comparison and aggregated features."""
    base_labels = self.polymer_featurizer.feature_labels()

    # One suffix per comparison, followed by one per aggregation; the
    # aggregation suffix combines all comparison names with the reducer name.
    suffixes = list(self.comparisons)
    if self.agg:
        comparison_methods_str = "_".join(self.comparisons)
        suffixes.extend(
            f"{comparison_methods_str}_{agg_name}" for agg_name in self.agg
        )

    labels = []
    for suffix in suffixes:
        labels.extend(self._generate_labels(base_labels, suffix))
    return labels

molecule

FullMolecularFeaturizer

Bases: MoleculeFeaturizer

Featurizer for general molecules.

This class can featurize any molecule from a Molecule object that contains a SMILES string and RDKit molecule object.

Source code in src/polymetrix/featurizers/molecule.py
class FullMolecularFeaturizer(MoleculeFeaturizer):
    """Featurizer that applies its calculator to a whole molecule.

    Works with any object exposing a ``mol`` attribute that holds an RDKit
    molecule, such as a ``Molecule`` created from a SMILES string.
    """

    def featurize(self, molecule) -> np.ndarray:
        """Run the calculator on the RDKit molecule held by ``molecule``.

        Args:
            molecule: Object whose ``mol`` attribute is the RDKit molecule.

        Returns:
            np.ndarray: Whatever ``self.calculator.calculate`` returns for it.

        Raises:
            ValueError: If ``molecule.mol`` is ``None``.
        """
        # Reject objects without a parsed molecule before touching the calculator.
        if molecule.mol is None:
            raise ValueError("Molecule object does not contain a valid RDKit molecule")
        rdkit_mol = molecule.mol
        return self.calculator.calculate(rdkit_mol)
featurize(molecule)

Featurize a molecule object.

Parameters:

Name Type Description Default
molecule

A Molecule object with a mol property containing an RDKit molecule.

required

Returns:

Type Description
ndarray

np.ndarray: Feature vector calculated by the underlying calculator.

Source code in src/polymetrix/featurizers/molecule.py
def featurize(self, molecule) -> np.ndarray:
    """Run the calculator on the RDKit molecule held by ``molecule``.

    Args:
        molecule: Object whose ``mol`` attribute is the RDKit molecule.

    Returns:
        np.ndarray: Whatever ``self.calculator.calculate`` returns for it.

    Raises:
        ValueError: If ``molecule.mol`` is ``None``.
    """
    # Reject objects without a parsed molecule before touching the calculator.
    if molecule.mol is None:
        raise ValueError("Molecule object does not contain a valid RDKit molecule")
    rdkit_mol = molecule.mol
    return self.calculator.calculate(rdkit_mol)

Molecule

A class to represent a general molecule from SMILES string.

Attributes:

Name Type Description
smiles Optional[str]

Optional[str], the SMILES string representing the molecule.

mol Optional[Mol]

Optional[Chem.Mol], the RDKit molecule object.

Raises:

Type Description
ValueError

If the provided SMILES string is invalid or cannot be processed.

Source code in src/polymetrix/featurizers/molecule.py
class Molecule:
    """A general (non-polymer) molecule defined by a SMILES string.

    Attributes:
        smiles: Optional[str], the SMILES string representing the molecule.
        mol: Optional[Chem.Mol], the RDKit molecule parsed from ``smiles``.

    Raises:
        ValueError: If the provided SMILES string is invalid or cannot be processed.
    """

    def __init__(self):
        # Both stay None until a SMILES string has been assigned successfully.
        self._smiles: Optional[str] = None
        self._mol: Optional[Chem.Mol] = None

    @classmethod
    def from_smiles(cls, smiles: str) -> "Molecule":
        """Create a Molecule and initialise it from ``smiles``.

        Args:
            smiles: str, the SMILES string representing the molecule.

        Returns:
            Molecule: A new Molecule object initialized with the given SMILES string.

        Raises:
            ValueError: If the SMILES string is invalid.
        """
        instance = cls()
        # Assigning through the property runs the validation in the setter.
        instance.smiles = smiles
        return instance

    @property
    def smiles(self) -> Optional[str]:
        """The stored SMILES string, or None if none has been set."""
        return self._smiles

    @smiles.setter
    def smiles(self, value: str):
        """Validate ``value``, parse it with RDKit and store both results.

        Args:
            value: str, the SMILES string to set.

        Raises:
            ValueError: If the SMILES string is invalid, cannot be processed,
                or contains polymer connection points (*). Any failure is
                re-raised as a ValueError chained to the original exception.
        """
        try:
            # Asterisk atoms mark a pSMILES (polymer SMILES) string, which
            # this class does not accept.
            if "*" in value:
                raise ValueError(
                    "SMILES string contains asterisk atoms (*) which indicates a pSMILES string. Use Polymer.from_psmiles() instead for polymer molecules."
                )

            parsed = Chem.MolFromSmiles(value)
            if parsed is None:
                raise ValueError("Invalid SMILES string")
        except Exception as e:
            raise ValueError(f"Error processing SMILES: {str(e)}") from e
        else:
            # Only update the state once validation and parsing succeeded.
            self._smiles = value
            self._mol = parsed

    @property
    def mol(self) -> Optional[Chem.Mol]:
        """The RDKit molecule object, or None if no SMILES has been set."""
        return self._mol

    def calculate_molecular_weight(self) -> float:
        """Return ``ExactMolWt`` of the stored molecule.

        Returns:
            float: The molecular weight of the molecule.

        Raises:
            ValueError: If no molecule has been set.
        """
        rdkit_mol = self._mol
        if rdkit_mol is None:
            raise ValueError("No molecule set")
        return ExactMolWt(rdkit_mol)
mol property

Gets the RDKit molecule object.

Returns:

Type Description
Optional[Mol]

Optional[Chem.Mol]: The RDKit molecule object, or None if not set.

smiles property writable

Gets the SMILES string of the molecule.

Returns:

Type Description
Optional[str]

Optional[str]: The SMILES string, or None if not set.

calculate_molecular_weight()

Calculates the exact molecular weight of the molecule.

Returns:

Name Type Description
float float

The molecular weight of the molecule.

Source code in src/polymetrix/featurizers/molecule.py
def calculate_molecular_weight(self) -> float:
    """Return ``ExactMolWt`` of the stored molecule.

    Returns:
        float: The molecular weight of the molecule.

    Raises:
        ValueError: If no molecule has been set.
    """
    rdkit_mol = self._mol
    if rdkit_mol is None:
        raise ValueError("No molecule set")
    return ExactMolWt(rdkit_mol)
from_smiles(smiles) classmethod

Creates a Molecule instance from a SMILES string.

Parameters:

Name Type Description Default
smiles str

str, the SMILES string representing the molecule.

required

Returns:

Name Type Description
Molecule Molecule

A new Molecule object initialized with the given SMILES string.

Raises:

Type Description
ValueError

If the SMILES string is invalid.

Source code in src/polymetrix/featurizers/molecule.py
@classmethod
def from_smiles(cls, smiles: str) -> "Molecule":
    """Create a Molecule and initialise it from ``smiles``.

    Args:
        smiles: str, the SMILES string representing the molecule.

    Returns:
        Molecule: A new Molecule object initialized with the given SMILES string.

    Raises:
        ValueError: If the SMILES string is invalid.
    """
    instance = cls()
    # Assigning to ``smiles`` hands the string to the freshly created instance.
    instance.smiles = smiles
    return instance

multiple_featurizer

MultipleFeaturizer

Source code in src/polymetrix/featurizers/multiple_featurizer.py
class MultipleFeaturizer:
    """Combine several featurizers into a single concatenated feature vector."""

    def __init__(self, featurizers: List[PolymerPartFeaturizer]):
        """Store the featurizers; their order fixes the output layout."""
        self.featurizers = featurizers
        # Remembered by ``featurize`` so ``feature_labels`` can reflect whether
        # the most recent polymer carried terminal groups.
        self._last_polymer = None

    def featurize(self, polymer) -> np.ndarray:
        """Featurize ``polymer`` with every featurizer and concatenate the results.

        Scalars, sequences and arrays returned by the individual featurizers
        are all accepted and flattened to 1-D before concatenation.

        Returns:
            np.ndarray: The concatenated 1-D feature vector; an empty array
            when no featurizers are configured.
        """
        self._last_polymer = polymer  # Store for label generation
        parts = [
            np.asarray(featurizer.featurize(polymer)).reshape(-1)
            for featurizer in self.featurizers
        ]
        if not parts:
            # np.concatenate raises on an empty sequence.
            return np.array([])
        return np.concatenate(parts)

    def feature_labels(self) -> List[str]:
        """Return feature labels with '_with_terminalgroups' suffix when applicable."""
        labels = []
        for featurizer in self.featurizers:
            labels.extend(featurizer.feature_labels())

        # Add terminal groups suffix if last polymer had terminal groups
        polymer = getattr(self, "_last_polymer", None)
        has_terminal_groups = bool(polymer) and bool(
            getattr(polymer, "backbone_terminal_groups", None)
            or getattr(polymer, "sidechain_terminal_groups", None)
        )
        if not has_terminal_groups:
            return labels

        renames = (
            ("_backbonefeaturizer", "_with_terminalgroups_backbonefeaturizer"),
            ("_sidechainfeaturizer", "_with_terminalgroups_sidechainfeaturizer"),
            ("_fullpolymerfeaturizer", "_with_terminalgroups_fullpolymerfeaturizer"),
        )
        renamed = []
        for label in labels:
            for old, new in renames:
                label = label.replace(old, new)
            renamed.append(label)
        return renamed

    def _unique_from_calculators(self, method_name: str) -> List[str]:
        """Collect unique entries from ``calculator.<method_name>()`` in first-seen order."""
        seen = {}
        for featurizer in self.featurizers:
            calculator = getattr(featurizer, "calculator", None)
            if calculator:
                # dict keys keep insertion order, giving a deterministic result.
                seen.update(dict.fromkeys(getattr(calculator, method_name)()))
        return list(seen)

    def citations(self) -> List[str]:
        """Return the unique citations of all calculators, in first-seen order."""
        return self._unique_from_calculators("citations")

    def implementors(self) -> List[str]:
        """Return the unique implementors of all calculators, in first-seen order."""
        return self._unique_from_calculators("implementors")
feature_labels()

Return feature labels with '_with_terminalgroups' suffix when applicable.

Source code in src/polymetrix/featurizers/multiple_featurizer.py
def feature_labels(self) -> List[str]:
    """Return feature labels with '_with_terminalgroups' suffix when applicable."""
    labels = []
    for featurizer in self.featurizers:
        labels.extend(featurizer.feature_labels())

    # Add terminal groups suffix if last polymer had terminal groups
    polymer = getattr(self, "_last_polymer", None)
    has_terminal_groups = bool(polymer) and bool(
        getattr(polymer, "backbone_terminal_groups", None)
        or getattr(polymer, "sidechain_terminal_groups", None)
    )
    if not has_terminal_groups:
        return labels

    # Applied in this order to every label, one replacement after the other.
    renames = (
        ("_backbonefeaturizer", "_with_terminalgroups_backbonefeaturizer"),
        ("_sidechainfeaturizer", "_with_terminalgroups_sidechainfeaturizer"),
        ("_fullpolymerfeaturizer", "_with_terminalgroups_fullpolymerfeaturizer"),
    )
    renamed = []
    for label in labels:
        for old, new in renames:
            label = label.replace(old, new)
        renamed.append(label)
    return renamed

polymer

Polymer

Represents a polymer molecule with its backbone and sidechain information.

Attributes:

Name Type Description
psmiles Optional[str]

Optional[str], the pSMILES string of the polymer.

backbone_terminal_groups Optional[Dict[str, str]]

Optional[Dict[str, str]], maps connection point patterns to backbone terminal group SMILES.

sidechain_terminal_groups Optional[Dict[str, str]]

Optional[Dict[str, str]], maps connection point patterns to sidechain terminal group SMILES.

graph

Optional[nx.Graph], the NetworkX graph of the polymer structure.

backbone_nodes

Optional[List[int]], node indices forming the backbone.

sidechain_nodes

Optional[List[int]], node indices forming the sidechains.

connection_points

Optional[List[int]], node indices of connection points.

_mol

Optional[Chem.Mol], the RDKit molecule object (internal use).

Source code in src/polymetrix/featurizers/polymer.py
class Polymer:
    """Polymer repeat unit parsed from a pSMILES string and split into backbone and sidechains.

    Attributes:
        psmiles: Optional[str], the pSMILES string the polymer was built from.
        backbone_terminal_groups: Optional[Dict[str, str]], terminal group SMILES for the backbone, keyed by connection point pattern.
        sidechain_terminal_groups: Optional[Dict[str, str]], terminal group SMILES for the sidechains, keyed by connection point pattern.
        graph: Optional[nx.Graph], NetworkX graph with one node per atom and one edge per bond.
        backbone_nodes: Optional[List[int]], indices of the atoms classified as backbone.
        sidechain_nodes: Optional[List[int]], indices of the atoms classified as sidechain.
        connection_points: Optional[List[int]], indices of the asterisk (*) atoms.
        _mol: Optional[Chem.Mol], the parsed RDKit molecule (internal use).
    """

    def __init__(self):
        # Every field stays None until a pSMILES string is assigned.
        self._psmiles = None
        self._backbone_terminal_groups = None
        self._sidechain_terminal_groups = None
        self.graph = None
        self.backbone_nodes = None
        self.sidechain_nodes = None
        self.connection_points = None
        self._mol = None

    @property
    def mol(self) -> Optional[Chem.Mol]:
        """Alias for ``full_polymer_mol`` so featurizers that read a ``mol`` attribute work."""
        return self.full_polymer_mol

    @classmethod
    def from_psmiles(cls, psmiles: str) -> "Polymer":
        """Builds a polymer from a pSMILES string.

        Args:
            psmiles: The pSMILES string describing the polymer.

        Returns:
            A freshly constructed instance with ``psmiles`` assigned.

        Raises:
            ValueError: If the ``psmiles`` setter rejects the string.
        """
        instance = cls()
        instance.psmiles = psmiles
        return instance

    @property
    def psmiles(self) -> Optional[str]:
        """The pSMILES string currently stored on the polymer."""
        return self._psmiles

    @psmiles.setter
    def psmiles(self, value: str):
        """Stores a new pSMILES string and rebuilds the derived structure.

        The molecule, the graph, the connection points and the
        backbone/sidechain classification are all recomputed.

        Args:
            value: The pSMILES string to store.

        Raises:
            ValueError: If ``value`` is falsy, is not a ``str``, cannot be
                parsed, or any later processing step fails.
        """
        if not (value and isinstance(value, str)):
            raise ValueError("pSMILES cannot be None or empty")
        try:
            parsed = Chem.MolFromSmiles(value)
            if parsed is None:
                raise ValueError("Invalid pSMILES string")
            self._psmiles = value
            self._mol = parsed
            self.graph = self._mol_to_nx(parsed)
            self._identify_connection_points()
            self._identify_backbone_and_sidechain()
        except Exception as err:
            # Any failure, including the ValueError raised just above, is
            # reported as a ValueError chained to the original exception.
            raise ValueError(f"Error processing pSMILES: {str(err)}") from err

    @property
    def backbone_terminal_groups(self) -> Optional[Dict[str, str]]:
        """Terminal group SMILES for the backbone, keyed by connection point pattern."""
        return self._backbone_terminal_groups

    @backbone_terminal_groups.setter
    def backbone_terminal_groups(self, value: Dict[str, str]):
        """Stores the backbone terminal group mapping as given (no validation)."""
        self._backbone_terminal_groups = value

    @property
    def sidechain_terminal_groups(self) -> Optional[Dict[str, str]]:
        """Terminal group SMILES for the sidechains, keyed by connection point pattern."""
        return self._sidechain_terminal_groups

    @sidechain_terminal_groups.setter
    def sidechain_terminal_groups(self, value: Dict[str, str]):
        """Stores the sidechain terminal group mapping as given (no validation)."""
        self._sidechain_terminal_groups = value

    @staticmethod
    def _mol_to_nx(mol: Chem.Mol) -> nx.Graph:
        """Builds a NetworkX graph mirroring an RDKit molecule.

        Args:
            mol: The RDKit molecule to convert.

        Returns:
            A graph whose nodes are atom indices (with ``atomic_num``,
            ``element``, ``formal_charge`` and ``is_aromatic`` attributes) and
            whose edges are bonds (with ``bond_type`` and ``is_aromatic``).
        """
        graph = nx.Graph()
        for atom in mol.GetAtoms():
            node_attrs = {
                "atomic_num": atom.GetAtomicNum(),
                "element": atom.GetSymbol(),
                "formal_charge": atom.GetFormalCharge(),
                "is_aromatic": atom.GetIsAromatic(),
            }
            graph.add_node(atom.GetIdx(), **node_attrs)
        for bond in mol.GetBonds():
            edge_attrs = {
                "bond_type": bond.GetBondType(),
                "is_aromatic": bond.GetIsAromatic(),
            }
            graph.add_edge(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), **edge_attrs)
        return graph

    def _identify_connection_points(self):
        """Records every graph node whose ``element`` is ``"*"`` as a connection point."""
        stars = []
        for node, attrs in self.graph.nodes(data=True):
            if attrs["element"] == "*":
                stars.append(node)
        self.connection_points = stars

    def _identify_backbone_and_sidechain(self):
        """Stores the backbone/sidechain split computed by ``classify_backbone_and_sidechains``."""
        backbone, sidechains = classify_backbone_and_sidechains(self.graph)
        self.backbone_nodes = backbone
        self.sidechain_nodes = sidechains

    @property
    def backbone_molecule(self) -> Chem.Mol:
        """The backbone as a molecule, with backbone terminal groups applied when set."""
        return self._get_backbone_molecule(include_terminal_groups=True)

    def _get_backbone_molecule(self, include_terminal_groups: bool = True) -> Chem.Mol:
        """Extracts the backbone atoms and optionally applies backbone terminal groups."""
        fragment = self._extract_substructure_mol(self.backbone_nodes)
        if not include_terminal_groups or not self._backbone_terminal_groups:
            return fragment
        return insert_terminal_group(
            fragment, self._backbone_terminal_groups, is_sidechain=False
        )

    @property
    def sidechain_molecules(self) -> List[Chem.Mol]:
        """One molecule per sidechain, with sidechain terminal groups applied when set."""
        return self._get_sidechain_molecules(include_terminal_groups=True)

    def _get_sidechain_molecules(
        self, include_terminal_groups: bool = True
    ) -> List[Chem.Mol]:
        """Extracts each connected sidechain component and optionally applies terminal groups."""
        sidechain_view = self.graph.subgraph(self.sidechain_nodes)
        components = list(nx.connected_components(sidechain_view))
        molecules = []
        for component in components:
            fragment = self._extract_substructure_mol(list(component))
            if include_terminal_groups and self._sidechain_terminal_groups:
                fragment = insert_terminal_group(
                    fragment, self._sidechain_terminal_groups, is_sidechain=True
                )
            molecules.append(fragment)
        return molecules

    @property
    def full_polymer_mol(self) -> Chem.Mol:
        """The whole molecule, with backbone terminal groups applied when set."""
        return self._get_full_polymer_mol(include_terminal_groups=True)

    def _get_full_polymer_mol(self, include_terminal_groups: bool = True) -> Chem.Mol:
        """Returns the parsed molecule, optionally with backbone terminal groups applied.

        Only the backbone terminal groups are used here; the sidechain mapping
        is not consulted.
        """
        if not include_terminal_groups or not self._backbone_terminal_groups:
            return self._mol
        return insert_terminal_group(
            self._mol, self._backbone_terminal_groups, is_sidechain=False
        )

    def _extract_substructure_mol(self, node_indices: List[int]) -> Chem.Mol:
        """Copies the given atoms, and the bonds among them, into a new molecule.

        Only atomic number, formal charge and the aromatic flag are copied
        onto the new atoms. An empty index list yields the molecule parsed
        from an empty SMILES string.
        """
        if not node_indices:
            return Chem.MolFromSmiles("")
        fragment = RWMol()
        index_map = {}
        for source_idx in node_indices:
            source_atom = self._mol.GetAtomWithIdx(source_idx)
            copied_atom = Atom(source_atom.GetAtomicNum())
            copied_atom.SetFormalCharge(source_atom.GetFormalCharge())
            if source_atom.GetIsAromatic():
                copied_atom.SetIsAromatic(True)
            index_map[source_idx] = fragment.AddAtom(copied_atom)
        for bond in self._mol.GetBonds():
            start = bond.GetBeginAtomIdx()
            stop = bond.GetEndAtomIdx()
            # Keep a bond only when both of its ends were selected.
            if start not in index_map or stop not in index_map:
                continue
            fragment.AddBond(index_map[start], index_map[stop], bond.GetBondType())
        return fragment.GetMol()

    def get_backbone_and_sidechain_molecules(
        self,
    ) -> Tuple[List[Chem.Mol], List[Chem.Mol]]:
        """Returns the backbone and sidechains as RDKit molecules.

        Returns:
            A tuple of (single-element list holding the backbone molecule,
            list of sidechain molecules).
        """
        backbone = [self.backbone_molecule]
        return backbone, self.sidechain_molecules

    def get_backbone_and_sidechain_graphs(self) -> Tuple[nx.Graph, List[nx.Graph]]:
        """Returns the backbone and sidechains as NetworkX subgraphs.

        Returns:
            A tuple of (backbone subgraph, list with one subgraph per
            connected sidechain component).
        """
        whole = self.graph
        backbone_view = whole.subgraph(self.backbone_nodes)
        sidechain_views = []
        for nodes in nx.connected_components(whole.subgraph(self.sidechain_nodes)):
            sidechain_views.append(whole.subgraph(nodes))
        return backbone_view, sidechain_views

    def calculate_molecular_weight(self) -> float:
        """Computes ``ExactMolWt`` of the parsed molecule.

        Returns:
            The weight of ``_mol`` (terminal groups are not applied), or 0.0
            when no molecule is set.
        """
        if not self._mol:
            return 0.0
        return ExactMolWt(self._mol)

    def get_connection_points(self) -> List[int]:
        """Returns the stored connection point indices.

        Returns:
            The ``connection_points`` attribute, unchanged.
        """
        return self.connection_points
backbone_molecule property

Gets the backbone molecule.

backbone_terminal_groups property writable

Maps connection point patterns to backbone terminal group SMILES.

full_polymer_mol property

Gets the full polymer molecule.

mol property

Returns the full polymer molecule, compatible with featurizers expecting a 'mol' attribute.

psmiles property writable

The pSMILES string of the polymer.

sidechain_molecules property

Gets the sidechain molecules.

sidechain_terminal_groups property writable

Maps connection point patterns to sidechain terminal group SMILES.

calculate_molecular_weight()

Calculates the exact molecular weight of the polymer.

Returns:

Type Description
float

The molecular weight of the polymer.

Source code in src/polymetrix/featurizers/polymer.py
def calculate_molecular_weight(self) -> float:
    """Computes ``ExactMolWt`` of the parsed molecule.

    Returns:
        The weight of ``_mol``, or 0.0 when no molecule is set.
    """
    if not self._mol:
        return 0.0
    return ExactMolWt(self._mol)
from_psmiles(psmiles) classmethod

Creates a Polymer instance from a pSMILES string.

Parameters:

Name Type Description Default
psmiles str

The pSMILES string representing the polymer.

required

Returns:

Type Description
Polymer

A new Polymer instance.

Raises:

Type Description
ValueError

If the pSMILES string is invalid.

Source code in src/polymetrix/featurizers/polymer.py
@classmethod
def from_psmiles(cls, psmiles: str) -> "Polymer":
    """Builds a polymer from a pSMILES string.

    Args:
        psmiles: The pSMILES string describing the polymer.

    Returns:
        A freshly constructed instance with ``psmiles`` assigned.

    Raises:
        ValueError: If assigning ``psmiles`` rejects the string.
    """
    instance = cls()
    instance.psmiles = psmiles
    return instance
get_backbone_and_sidechain_graphs()

Extracts NetworkX graphs for the backbone and sidechains.

Returns:

Type Description
Tuple[Graph, List[Graph]]

A tuple of (backbone graph, list of sidechain graphs).

Source code in src/polymetrix/featurizers/polymer.py
def get_backbone_and_sidechain_graphs(self) -> Tuple[nx.Graph, List[nx.Graph]]:
    """Returns the backbone and sidechains as NetworkX subgraphs.

    Returns:
        A tuple of (backbone subgraph, list with one subgraph per connected
        sidechain component).
    """
    whole = self.graph
    backbone_view = whole.subgraph(self.backbone_nodes)
    sidechain_views = []
    for nodes in nx.connected_components(whole.subgraph(self.sidechain_nodes)):
        sidechain_views.append(whole.subgraph(nodes))
    return backbone_view, sidechain_views
get_backbone_and_sidechain_molecules()

Extracts RDKit molecules for the backbone and sidechains.

Returns:

Type Description
Tuple[List[Mol], List[Mol]]

A tuple of (list of backbone molecules, list of sidechain molecules).

Source code in src/polymetrix/featurizers/polymer.py
def get_backbone_and_sidechain_molecules(
    self,
) -> Tuple[List[Chem.Mol], List[Chem.Mol]]:
    """Returns the backbone and sidechains as RDKit molecules.

    Returns:
        A tuple of (single-element list holding the backbone molecule, list
        of sidechain molecules).
    """
    backbone = [self.backbone_molecule]
    return backbone, self.sidechain_molecules
get_connection_points()

Gets the connection point node indices.

Returns:

Type Description
List[int]

List of node indices representing connection points.

Source code in src/polymetrix/featurizers/polymer.py
def get_connection_points(self) -> List[int]:
    """Returns the stored connection point indices.

    Returns:
        The ``connection_points`` attribute, unchanged.
    """
    points = self.connection_points
    return points

add_degree_one_nodes_to_backbone(graph, backbone)

Adds degree-1 nodes connected to backbone nodes to the backbone list, avoiding duplicates.

Parameters:

Name Type Description Default
graph Graph

The input graph to analyze.

required
backbone List[int]

Initial list of backbone node indices.

required

Returns:

Type Description
List[int]

Updated backbone list including degree-1 nodes, with no duplicates.

Source code in src/polymetrix/featurizers/polymer.py
def add_degree_one_nodes_to_backbone(graph: nx.Graph, backbone: List[int]) -> List[int]:
    """Adds degree-1 nodes connected to backbone nodes to the backbone list, avoiding duplicates.

    The ``backbone`` list is extended in place and the same list object is
    returned. Nodes are visited in ``graph.nodes`` order, so a node appended
    earlier in the pass counts as backbone for nodes visited later.

    Args:
        graph: The input graph to analyze.
        backbone: Initial list of backbone node indices (mutated in place).

    Returns:
        The same ``backbone`` list, now also containing every degree-1 node
        whose single neighbor is in the backbone.
    """
    # Set mirror of ``backbone``: membership tests become O(1) instead of a
    # list scan per node, which made the pass quadratic in graph size.
    members = set(backbone)
    for node in graph.nodes:
        if graph.degree[node] != 1 or node in members:
            continue
        # A degree-1 node has exactly one neighbor.
        neighbor = next(iter(graph.neighbors(node)))
        if neighbor in members:
            backbone.append(node)
            members.add(node)
    return backbone

attach_terminal_to_atom(mol, target_idx, terminal_mol, attachment_idx=None)

Attaches a terminal group to a specific atom in the molecule.

Parameters:

Name Type Description Default
mol RWMol

The molecule being modified.

required
target_idx int

Index of the target atom to attach the terminal group.

required
terminal_mol Mol

The terminal group molecule.

required
attachment_idx int

Index of the attachment point in the terminal group (optional for sidechains).

None

Returns:

Type Description
RWMol

The modified molecule.

Source code in src/polymetrix/featurizers/polymer.py
def attach_terminal_to_atom(
    mol: RWMol,
    target_idx: int,
    terminal_mol: Chem.Mol,
    attachment_idx: Optional[int] = None,
) -> RWMol:
    """Attaches a terminal group to a specific atom in the molecule.

    Every atom of ``terminal_mol`` is copied into ``mol`` (atomic number and
    formal charge only), its bonds are recreated, and a single bond is added
    between ``target_idx`` and the first atom of the terminal group.

    Args:
        mol: The molecule being modified (mutated in place).
        target_idx: Index of the target atom to attach the terminal group.
        terminal_mol: The terminal group molecule.
        attachment_idx: Accepted for interface compatibility; the current
            implementation does not read it and always bonds to the first
            terminal atom.

    Returns:
        The modified molecule (the same ``mol`` object). When ``terminal_mol``
        has no atoms, ``mol`` is returned unchanged.
    """
    atom_mapping = {}
    for atom in terminal_mol.GetAtoms():
        new_atom = Atom(atom.GetAtomicNum())
        new_atom.SetFormalCharge(atom.GetFormalCharge())
        new_idx = mol.AddAtom(new_atom)
        atom_mapping[atom.GetIdx()] = new_idx
    for bond in terminal_mol.GetBonds():
        begin_idx = bond.GetBeginAtomIdx()
        end_idx = bond.GetEndAtomIdx()
        if begin_idx in atom_mapping and end_idx in atom_mapping:
            mol.AddBond(
                atom_mapping[begin_idx], atom_mapping[end_idx], bond.GetBondType()
            )
    if not atom_mapping:
        # An atom-less terminal group (e.g. a SMILES that was only "*" before
        # the asterisk was stripped) has nothing to bond to. Without this
        # guard the lookup below would leak a bare StopIteration.
        return mol
    first_terminal_atom_idx = next(iter(atom_mapping))
    mol.AddBond(target_idx, atom_mapping[first_terminal_atom_idx], Chem.BondType.SINGLE)
    return mol

classify_backbone_and_sidechains(graph)

Classifies nodes into backbone and sidechain components based on paths and cycles.

Parameters:

Name Type Description Default
graph Graph

The input graph to classify.

required

Returns:

Type Description
Tuple[List[int], List[int]]

A tuple of (backbone nodes, sidechain nodes).

Source code in src/polymetrix/featurizers/polymer.py
def classify_backbone_and_sidechains(graph: nx.Graph) -> Tuple[List[int], List[int]]:
    """Classifies nodes into backbone and sidechain components based on paths and cycles.

    The backbone is the union of (a) the shortest paths between asterisk
    nodes, (b) the cycles that share a node with those paths, and (c) the
    degree-1 nodes hanging off that set. Every other node is a sidechain node.

    Args:
        graph: The input graph to classify.

    Returns:
        A tuple of (backbone nodes, sidechain nodes). Sidechain nodes are
        listed in ``graph.nodes`` order.
    """
    shortest_paths = find_shortest_paths_between_stars(graph)
    cycles = find_cycles_including_paths(graph, shortest_paths)
    seed_nodes = set()
    for cycle in cycles:
        seed_nodes.update(cycle)
    for path in shortest_paths:
        seed_nodes.update(path)
    backbone_nodes = add_degree_one_nodes_to_backbone(graph, list(seed_nodes))
    # Test membership against a set: scanning the backbone list for every
    # graph node made this filter quadratic.
    backbone_lookup = set(backbone_nodes)
    sidechain_nodes = [node for node in graph.nodes if node not in backbone_lookup]
    return list(backbone_nodes), sidechain_nodes

find_cycles_including_paths(graph, paths)

Identifies cycles that include nodes from the given paths.

Parameters:

Name Type Description Default
graph Graph

The input graph to analyze.

required
paths List[List[int]]

List of paths whose nodes are used to filter cycles.

required

Returns:

Type Description
List[List[int]]

List of cycles, where each cycle is a list of node indices.

Source code in src/polymetrix/featurizers/polymer.py
def find_cycles_including_paths(
    graph: nx.Graph, paths: List[List[int]]
) -> List[List[int]]:
    """Selects the basis cycles that touch at least one node of the given paths.

    Args:
        graph: The graph whose cycle basis is computed.
        paths: Paths whose nodes decide which cycles are kept.

    Returns:
        The cycles from ``nx.cycle_basis(graph)`` that share a node with any
        path, each as a list of node indices, in basis order.
    """
    basis = nx.cycle_basis(graph)
    nodes_on_paths = set()
    for path in paths:
        nodes_on_paths.update(path)
    selected = []
    for cycle in basis:
        # Keep the cycle as soon as it has any node in common with a path.
        if not nodes_on_paths.isdisjoint(cycle):
            selected.append(cycle)
    return selected

find_shortest_paths_between_stars(graph)

Finds shortest paths between all pairs of asterisk (*) nodes in the graph.

Parameters:

Name Type Description Default
graph Graph

The input graph to analyze.

required

Returns:

Type Description
List[List[int]]

List of shortest paths, where each path is a list of node indices.

Source code in src/polymetrix/featurizers/polymer.py
def find_shortest_paths_between_stars(graph: nx.Graph) -> List[List[int]]:
    """Collects a shortest path for every unordered pair of asterisk (*) nodes.

    Args:
        graph: Graph whose nodes carry an ``element`` attribute.

    Returns:
        One path (list of node indices) per pair of ``"*"`` nodes that is
        connected; pairs with no connecting path are skipped.
    """
    stars = []
    for node, attrs in graph.nodes(data=True):
        if attrs["element"] == "*":
            stars.append(node)

    found = []
    for position, source in enumerate(stars):
        # Pair each star only with the stars after it, so every unordered
        # pair is visited exactly once.
        for target in stars[position + 1:]:
            try:
                found.append(nx.shortest_path(graph, source=source, target=target))
            except nx.NetworkXNoPath:
                continue
    return found

insert_terminal_group(mol, terminal_groups, is_sidechain=False)

Inserts terminal groups into a molecule by replacing connection points or attaching to sidechains.

Parameters:

Name Type Description Default
mol Mol

The RDKit molecule to modify.

required
terminal_groups Dict[str, str]

Dictionary mapping patterns to terminal group SMILES.

required
is_sidechain bool

If True, attach terminal groups to sidechains; else, replace backbone connection points.

False

Returns:

Type Description
Mol

A new RDKit molecule with terminal groups inserted.

Source code in src/polymetrix/featurizers/polymer.py
def insert_terminal_group(
    mol: Chem.Mol, terminal_groups: Dict[str, str], is_sidechain: bool = False
) -> Chem.Mol:
    """Returns a copy of ``mol`` with terminal groups added.

    In sidechain mode every terminal group (asterisks stripped from its
    SMILES) is attached to atom 0 of the molecule. In backbone mode only the
    ``"[*]"`` pattern is handled: every asterisk atom of the molecule is
    replaced by that terminal group. Terminal SMILES that cannot be parsed,
    or backbone terminal groups without an asterisk, are skipped with a
    logged warning.

    Args:
        mol: The RDKit molecule to modify.
        terminal_groups: Dictionary mapping patterns to terminal group SMILES.
        is_sidechain: If True, attach terminal groups to sidechains; else, replace backbone connection points.

    Returns:
        A new RDKit molecule with terminal groups inserted, or ``mol`` itself
        when ``terminal_groups`` is empty.
    """
    if not terminal_groups:
        return mol

    editable = RWMol(mol)

    if is_sidechain:
        for terminal_smiles in terminal_groups.values():
            # The asterisk is dropped: the group is bonded on, not substituted.
            cap = Chem.MolFromSmiles(terminal_smiles.replace("*", ""))
            if cap is None:
                logging.warning(f"Invalid terminal group SMILES '{terminal_smiles}'")
                continue
            # Index 0 is the first atom of the sidechain fragment.
            editable = attach_terminal_to_atom(editable, 0, cap, attachment_idx=None)
        return editable.GetMol()

    star_indices = [
        atom.GetIdx() for atom in editable.GetAtoms() if atom.GetSymbol() == "*"
    ]
    for pattern, terminal_smiles in terminal_groups.items():
        if pattern != "[*]":
            continue
        cap = Chem.MolFromSmiles(terminal_smiles)
        if cap is None:
            logging.warning(f"Invalid terminal group SMILES '{terminal_smiles}'")
            continue
        # First asterisk in the terminal group marks where it joins the chain.
        attachment_idx = next(
            (atom.GetIdx() for atom in cap.GetAtoms() if atom.GetSymbol() == "*"),
            None,
        )
        if attachment_idx is None:
            logging.warning(
                f"No attachment point (*) found in terminal group '{terminal_smiles}'"
            )
            continue
        # Highest index first, so removing one asterisk never shifts the
        # indices of the asterisks still to be processed.
        for star_idx in sorted(star_indices, reverse=True):
            editable = replace_asterisk_with_terminal(
                editable, star_idx, cap, attachment_idx
            )

    return editable.GetMol()

replace_asterisk_with_terminal(mol, asterisk_idx, terminal_mol, attachment_idx)

Replaces a single asterisk atom with a terminal group.

Parameters:

Name Type Description Default
mol RWMol

The molecule being modified.

required
asterisk_idx int

Index of the asterisk atom to replace.

required
terminal_mol Mol

The terminal group molecule.

required
attachment_idx int

Index of the attachment point in the terminal group.

required

Returns:

Type Description
RWMol

The modified molecule.

Source code in src/polymetrix/featurizers/polymer.py
def replace_asterisk_with_terminal(
    mol: RWMol, asterisk_idx: int, terminal_mol: Chem.Mol, attachment_idx: int
) -> RWMol:
    """Replaces a single asterisk atom with a terminal group.

    If the asterisk has no neighbors it is overwritten in place by a copy of
    the first non-asterisk atom of ``terminal_mol``; no other terminal atoms
    or bonds are added in that case. Otherwise the asterisk is removed, every
    terminal atom except the attachment point is appended to ``mol``, the
    terminal group's internal bonds are recreated, and the asterisk's first
    neighbor is bonded to one atom that was bonded to the attachment point,
    reusing the bond type the asterisk had.

    Only the first neighbor of the asterisk is reconnected. Copied atoms carry
    only atomic number and formal charge.

    Args:
        mol: The molecule being modified (mutated in place).
        asterisk_idx: Index of the asterisk atom to replace.
        terminal_mol: The terminal group molecule.
        attachment_idx: Index of the attachment point in the terminal group.

    Returns:
        The modified molecule (the same ``mol`` object).
    """
    asterisk_atom = mol.GetAtomWithIdx(asterisk_idx)
    neighbors = [n.GetIdx() for n in asterisk_atom.GetNeighbors()]
    if not neighbors:
        # Isolated asterisk: swap it for the first real atom of the terminal
        # group. If the terminal group has only asterisks, mol is unchanged.
        for atom in terminal_mol.GetAtoms():
            if atom.GetSymbol() != "*":
                new_atom = Atom(atom.GetAtomicNum())
                new_atom.SetFormalCharge(atom.GetFormalCharge())
                mol.ReplaceAtom(asterisk_idx, new_atom)
                break
        return mol
    # Only the first neighbor is reconnected to the terminal group.
    neighbor_idx = neighbors[0]
    bond = mol.GetBondBetweenAtoms(asterisk_idx, neighbor_idx)
    # Remember the bond type before the asterisk (and its bond) is removed.
    bond_type = bond.GetBondType() if bond else Chem.BondType.SINGLE
    mol.RemoveAtom(asterisk_idx)
    # Compensate for the index shift caused by removing the asterisk.
    if neighbor_idx > asterisk_idx:
        neighbor_idx -= 1
    # Maps terminal-group atom index -> index of its copy in mol.
    atom_mapping = {}
    for atom in terminal_mol.GetAtoms():
        # The attachment point itself is not copied.
        if atom.GetIdx() != attachment_idx:
            new_atom = Atom(atom.GetAtomicNum())
            new_atom.SetFormalCharge(atom.GetFormalCharge())
            new_idx = mol.AddAtom(new_atom)
            atom_mapping[atom.GetIdx()] = new_idx
    # First pass: recreate the bonds internal to the terminal group.
    for bond in terminal_mol.GetBonds():
        begin_idx = bond.GetBeginAtomIdx()
        end_idx = bond.GetEndAtomIdx()
        if begin_idx == attachment_idx or end_idx == attachment_idx:
            continue
        if begin_idx in atom_mapping and end_idx in atom_mapping:
            mol.AddBond(
                atom_mapping[begin_idx], atom_mapping[end_idx], bond.GetBondType()
            )
    # Second pass: find an atom that was bonded to the attachment point and
    # join it to the former neighbor of the asterisk. Stops after the first
    # such bond, so only one connection is made.
    for bond in terminal_mol.GetBonds():
        if bond.GetBeginAtomIdx() == attachment_idx:
            connection_atom_idx = bond.GetEndAtomIdx()
        elif bond.GetEndAtomIdx() == attachment_idx:
            connection_atom_idx = bond.GetBeginAtomIdx()
        else:
            continue
        if connection_atom_idx in atom_mapping:
            mol.AddBond(neighbor_idx, atom_mapping[connection_atom_idx], bond_type)
            break
    return mol

sidechain_backbone_featurizer

SidechainDiversityFeaturizer

Bases: BaseFeatureCalculator

Computes the number of structurally diverse sidechains in a polymer based on graph isomorphism.

Source code in src/polymetrix/featurizers/sidechain_backbone_featurizer.py
class SidechainDiversityFeaturizer(BaseFeatureCalculator):
    """Counts the structurally distinct sidechains of a polymer.

    Sidechains are compared through their Weisfeiler-Lehman graph hash; two
    sidechains with the same hash count as one.
    """

    def featurize(self, polymer) -> np.ndarray:
        """Returns a one-element array with the number of distinct sidechain hashes."""
        sidechain_graphs = polymer.get_backbone_and_sidechain_graphs()[1]
        distinct = {
            nx.weisfeiler_lehman_graph_hash(sidechain)
            for sidechain in sidechain_graphs
        }
        return np.array([len(distinct)])

    def feature_labels(self) -> List[str]:
        """Returns the single label for this feature."""
        return ["num_diverse_sidechains"]

SidechainLengthToStarAttachmentDistanceRatioFeaturizer

Bases: BaseFeatureCalculator

Computes aggregated ratios of sidechain lengths to the shortest backbone distance from the polymer's star node (*) to each sidechain's attachment point.

Source code in src/polymetrix/featurizers/sidechain_backbone_featurizer.py
class SidechainLengthToStarAttachmentDistanceRatioFeaturizer(BaseFeatureCalculator):
    """Aggregates, over all sidechains, the ratio of sidechain size to star distance.

    The sidechain size is its node count. The star distance is the shortest
    distance from any star node (*) to a backbone atom the sidechain is
    attached to, plus one.
    """

    def _compute_min_backbone_length(self, sidechain, star_nodes, star_paths, graph):
        """Returns the smallest star-to-attachment distance (plus one) for a sidechain.

        For each sidechain atom with a neighbor outside the sidechain, one
        such neighbor is used as the attachment point. Returns ``inf`` when no
        attachment point is reachable from any star.
        """
        members = set(sidechain.nodes())
        best = float("inf")
        for member in members:
            outside = set(graph.neighbors(member)) - members
            if not outside:
                continue
            anchor = next(iter(outside))
            for star in star_nodes:
                reachable = star_paths[star]
                if anchor in reachable:
                    best = min(best, reachable[anchor] + 1)
        return best

    def featurize(self, polymer) -> np.ndarray:
        """Returns the aggregated ratios, or zeros when there is nothing to aggregate.

        Zeros (one per entry of ``self.agg``) are returned when the polymer
        has no sidechains, the backbone graph is empty, or no ratio could be
        computed. ``self.agg`` and ``self.aggregate`` come from the base class.
        """
        graph = polymer.graph
        star_nodes = [
            node for node, attrs in graph.nodes(data=True) if attrs["element"] == "*"
        ]
        backbone_graphs, sidechain_graphs = polymer.get_backbone_and_sidechain_graphs()

        if not (sidechain_graphs and backbone_graphs):
            return np.zeros(len(self.agg))

        # Distances from each star to every node reachable from it.
        star_paths = {}
        for star in star_nodes:
            star_paths[star] = nx.single_source_shortest_path_length(graph, star)

        ratios = []
        for sidechain in sidechain_graphs:
            backbone_length = self._compute_min_backbone_length(
                sidechain, star_nodes, star_paths, graph
            )
            if backbone_length > 0:
                ratios.append(len(sidechain.nodes()) / backbone_length)

        if not ratios:
            return np.zeros(len(self.agg))

        return np.array(self.aggregate(ratios))

    def feature_base_labels(self) -> List[str]:
        """Returns the base label for this feature."""
        return ["sidechainlength_to_star_attachment_distance_ratio"]

StarToSidechainMinDistanceFeaturizer

Bases: BaseFeatureCalculator

Computes aggregated minimum backbone distances from star nodes (*) to sidechains in a polymer.

Source code in src/polymetrix/featurizers/sidechain_backbone_featurizer.py
class StarToSidechainMinDistanceFeaturizer(BaseFeatureCalculator):
    """Computes aggregated minimum backbone distances from star nodes (*) to sidechains in a polymer."""

    def featurize(self, polymer) -> np.ndarray:
        """Aggregates, over all sidechains, the minimum star-to-sidechain distance.

        For each sidechain the distance is the smallest shortest-path length
        from any star node to any sidechain atom, minus one. Sidechains that
        no star can reach are left out.

        Args:
            polymer: Object exposing ``graph`` and
                ``get_backbone_and_sidechain_graphs()``.

        Returns:
            The result of ``self.aggregate`` over the per-sidechain distances,
            or zeros (one per entry of ``self.agg``) when there are none.
        """
        graph = polymer.graph
        star_nodes = [
            node for node, data in graph.nodes(data=True) if data["element"] == "*"
        ]
        sidechain_graphs = polymer.get_backbone_and_sidechain_graphs()[1]

        if not sidechain_graphs:
            return np.zeros(len(self.agg))

        # One breadth-first search per star gives the distance to every
        # reachable node. The previous code ran has_path plus
        # shortest_path_length for each (star, atom) pair.
        star_distances = [
            nx.single_source_shortest_path_length(graph, star) for star in star_nodes
        ]

        distances = []
        for sidechain in sidechain_graphs:
            valid_dists = [
                reachable[node] - 1
                for reachable in star_distances
                for node in sidechain.nodes()
                if node in reachable
            ]
            if valid_dists:
                distances.append(min(valid_dists))

        if not distances:
            return np.zeros(len(self.agg))

        return self.aggregate(distances)

    def feature_base_labels(self) -> List[str]:
        """Returns the base label for this feature."""
        return ["star_to_sidechain_min_distance"]

splitters

splitters

PolymerClassSplitter

Bases: BaseSplitter

Splitter based on polymer class

Source code in src/polymetrix/splitters/splitters.py
class PolymerClassSplitter(BaseSplitter):
    """Splitter that groups samples by a metadata column (polymer class by default)."""

    def __init__(
        self,
        ds: AbstractDataset,
        column_name: str = "meta.polymer_class",
        shuffle: bool = True,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
        **kwargs,
    ) -> None:
        """Initialize PolymerClassSplitter

        Args:
            ds: Dataset to split
            column_name: Name of the metadata column that defines the groups
            shuffle: Forwarded to BaseSplitter
            random_state: Forwarded to BaseSplitter
            **kwargs: Additional arguments to pass to BaseSplitter
        """
        # Stored before the base initializer runs, as in the original order.
        self._column_name = column_name
        super().__init__(ds=ds, shuffle=shuffle, random_state=random_state, **kwargs)

    def _get_groups(self) -> Collection[str]:
        """Return the flattened values of the configured metadata column."""
        column = self._ds._meta_names.index(self._column_name)
        return self._ds._meta_data[:, column].flatten()

TgSplitter

Bases: BaseSplitter

Splitter based on Tg values

Source code in src/polymetrix/splitters/splitters.py
class TgSplitter(BaseSplitter):
    """Splitter whose groups are quantile bins of a label (Tg by default)."""

    def __init__(
        self,
        ds: AbstractDataset,
        tg_q: Optional[Collection[float]] = None,
        label_name: str = "labels.Exp_Tg(K)",
        shuffle: bool = True,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
        **kwargs,
    ) -> None:
        """Initialize TgSplitter

        Args:
            ds: Dataset to split
            tg_q: Quantiles handed to ``quantile_binning`` to bin the label
                values into groups
            label_name: Name of the label whose values are binned
            shuffle: Whether to shuffle the dataset
            random_state: Random state for shuffling
            **kwargs: Additional arguments to pass to BaseSplitter
        """
        # Both attributes are stored before delegating to the base class.
        # NOTE(review): presumably BaseSplitter may request the groups during
        # its own initialization - confirm against BaseSplitter.
        self._label_name = label_name
        self._grouping_q = tg_q
        super().__init__(ds=ds, shuffle=shuffle, random_state=random_state, **kwargs)

    def _get_groups(self) -> Collection[int]:
        """Return one quantile-bin group per dataset row for the chosen label."""
        dataset = self._ds
        every_row = range(len(dataset))
        label_values = dataset.get_labels(
            idx=every_row, label_names=[self._label_name]
        )
        return quantile_binning(label_values.flatten(), self._grouping_q)
__init__(ds, tg_q=None, label_name='labels.Exp_Tg(K)', shuffle=True, random_state=None, **kwargs)

Initialize TgSplitter

Parameters:

Name Type Description Default
ds AbstractDataset

Dataset to split

required
tg_q Optional[Collection[float]]

Quantiles to bin Tg values into groups

None
label_name str

Name of the label to use for splitting

'labels.Exp_Tg(K)'
shuffle bool

Whether to shuffle the dataset

True
random_state Optional[Union[int, RandomState]]

Random state for shuffling

None
**kwargs

Additional arguments to pass to BaseSplitter

{}
Source code in src/polymetrix/splitters/splitters.py
def __init__(
    self,
    ds: AbstractDataset,
    tg_q: Optional[Collection[float]] = None,
    label_name: str = "labels.Exp_Tg(K)",
    shuffle: bool = True,
    random_state: Optional[Union[int, np.random.RandomState]] = None,
    **kwargs,
) -> None:
    """Initialize TgSplitter

    Args:
        ds: Dataset to split
        tg_q: Quantiles used to bin the label values into groups
        label_name: Name of the label whose values are used for splitting
        shuffle: Whether to shuffle the dataset
        random_state: Random state for shuffling
        **kwargs: Additional arguments to pass to BaseSplitter
    """
    # Splitter-specific settings are stored first; the remaining arguments
    # are handled by BaseSplitter.
    self._label_name = label_name
    self._grouping_q = tg_q
    super().__init__(ds=ds, shuffle=shuffle, random_state=random_state, **kwargs)