Using featurizers
In [1]:
Copied!
%reload_ext autoreload
%autoreload 2
%reload_ext autoreload
%autoreload 2
In [2]:
Copied!
import logging
from typing import List, Optional
import pandas as pd
from polymetrix.featurizers.polymer import Polymer
from polymetrix.featurizers.molecule import Molecule
from polymetrix.featurizers.chemical_featurizer import (
NumHBondDonors,
NumHBondAcceptors,
NumRotatableBonds,
NumRings,
NumNonAromaticRings,
NumAromaticRings,
NumAtoms,
TopologicalSurfaceArea,
FractionBicyclicRings,
NumAliphaticHeterocycles,
SlogPVSA1,
BalabanJIndex,
MolecularWeight,
Sp3CarbonCountFeaturizer,
Sp2CarbonCountFeaturizer,
MaxEStateIndex,
SmrVSA5,
FpDensityMorgan1,
HalogenCounts,
BondCounts,
BridgingRingsCount,
MaxRingSize,
HeteroatomCount,
HeteroatomDensity,
)
from polymetrix.featurizers.sidechain_backbone_featurizer import (
SideChainFeaturizer,
NumSideChainFeaturizer,
BackBoneFeaturizer,
NumBackBoneFeaturizer,
FullPolymerFeaturizer,
SidechainLengthToStarAttachmentDistanceRatioFeaturizer,
StarToSidechainMinDistanceFeaturizer,
SidechainDiversityFeaturizer,
)
from polymetrix.featurizers.molecule import FullMolecularFeaturizer
from polymetrix.featurizers.multiple_featurizer import MultipleFeaturizer
from polymetrix.featurizers.comparator import PolymerMoleculeComparator
import logging
from typing import List, Optional
import pandas as pd
from polymetrix.featurizers.polymer import Polymer
from polymetrix.featurizers.molecule import Molecule
from polymetrix.featurizers.chemical_featurizer import (
NumHBondDonors,
NumHBondAcceptors,
NumRotatableBonds,
NumRings,
NumNonAromaticRings,
NumAromaticRings,
NumAtoms,
TopologicalSurfaceArea,
FractionBicyclicRings,
NumAliphaticHeterocycles,
SlogPVSA1,
BalabanJIndex,
MolecularWeight,
Sp3CarbonCountFeaturizer,
Sp2CarbonCountFeaturizer,
MaxEStateIndex,
SmrVSA5,
FpDensityMorgan1,
HalogenCounts,
BondCounts,
BridgingRingsCount,
MaxRingSize,
HeteroatomCount,
HeteroatomDensity,
)
from polymetrix.featurizers.sidechain_backbone_featurizer import (
SideChainFeaturizer,
NumSideChainFeaturizer,
BackBoneFeaturizer,
NumBackBoneFeaturizer,
FullPolymerFeaturizer,
SidechainLengthToStarAttachmentDistanceRatioFeaturizer,
StarToSidechainMinDistanceFeaturizer,
SidechainDiversityFeaturizer,
)
from polymetrix.featurizers.molecule import FullMolecularFeaturizer
from polymetrix.featurizers.multiple_featurizer import MultipleFeaturizer
from polymetrix.featurizers.comparator import PolymerMoleculeComparator
Full Polymer Featurization
In [3]:
Copied!
# pSMILES strings of the example polymers. The side-chain and backbone cells
# further down iterate over this same list.
psmiles_list = [
    "c1ccccc1[*]CCO[*]",
    "CC[*]CCCC[*]",
]

# Each chemical descriptor is wrapped in FullPolymerFeaturizer; the wrapped
# featurizers are then bundled so that a single featurize() call yields all
# of their values.
full_featurizers = [
    FullPolymerFeaturizer(NumRings()),
    FullPolymerFeaturizer(MolecularWeight()),
    FullPolymerFeaturizer(TopologicalSurfaceArea()),
]
full_multi_featurizer = MultipleFeaturizer(full_featurizers)

for psmiles in psmiles_list:
    polymer = Polymer.from_psmiles(psmiles)
    features = full_multi_featurizer.featurize(polymer)
    # Ask for the labels only after featurize(): elsewhere in this notebook
    # the labels gain a "with_terminalgroups" infix for polymers that carry
    # terminal groups, so they appear to depend on the polymer that was just
    # featurized -- do not hoist this call out of the loop (TODO confirm).
    labels = full_multi_featurizer.feature_labels()
    for label, value in zip(labels, features):
        print(f"{label}: {value:.2f}")
num_rings_sum_fullpolymerfeaturizer: 1.00 molecular_weight_sum_fullpolymerfeaturizer: 121.07 topological_surface_area_sum_fullpolymerfeaturizer: 9.23 num_rings_sum_fullpolymerfeaturizer: 0.00 molecular_weight_sum_fullpolymerfeaturizer: 85.10 topological_surface_area_sum_fullpolymerfeaturizer: 0.00
Side Chain Featurization
In [4]:
Copied!
# Side-chain features: the number of side chains, plus descriptors wrapped in
# SideChainFeaturizer. Each descriptor is constructed with agg=["sum"].
sidechain_featurizers = [
    NumSideChainFeaturizer(),
    SideChainFeaturizer(NumAtoms(agg=["sum"])),
    SideChainFeaturizer(NumHBondDonors(agg=["sum"])),
    SideChainFeaturizer(NumRotatableBonds(agg=["sum"])),
]
sidechain_multi_featurizer = MultipleFeaturizer(sidechain_featurizers)

# Iterates over the `psmiles_list` defined in the full-polymer cell above.
for psmiles in psmiles_list:
    polymer = Polymer.from_psmiles(psmiles)
    features = sidechain_multi_featurizer.featurize(polymer)
    # Labels are requested after featurize(); they appear to depend on the
    # polymer that was just featurized, so keep this inside the loop
    # (TODO confirm).
    labels = sidechain_multi_featurizer.feature_labels()
    for label, value in zip(labels, features):
        print(f"{label}: {value:.2f}")
numsidechainfeaturizer: 1.00 num_atoms_sidechainfeaturizer_sum: 6.00 num_hbond_donors_sidechainfeaturizer_sum: 0.00 num_rotatable_bonds_sidechainfeaturizer_sum: 0.00 numsidechainfeaturizer: 1.00 num_atoms_sidechainfeaturizer_sum: 2.00 num_hbond_donors_sidechainfeaturizer_sum: 0.00 num_rotatable_bonds_sidechainfeaturizer_sum: 0.00
Backbone Featurization
In [5]:
Copied!
# Backbone features: the number of backbones, plus descriptors wrapped in
# BackBoneFeaturizer.
backbone_featurizers = [
    NumBackBoneFeaturizer(),
    BackBoneFeaturizer(NumRings()),
    BackBoneFeaturizer(NumAtoms()),
    BackBoneFeaturizer(TopologicalSurfaceArea()),
]
backbone_multi_featurizer = MultipleFeaturizer(backbone_featurizers)

# Iterates over the `psmiles_list` defined in the full-polymer cell above.
for psmiles in psmiles_list:
    polymer = Polymer.from_psmiles(psmiles)
    features = backbone_multi_featurizer.featurize(polymer)
    # Labels are requested after featurize(); they appear to depend on the
    # polymer that was just featurized, so keep this inside the loop
    # (TODO confirm).
    labels = backbone_multi_featurizer.feature_labels()
    for label, value in zip(labels, features):
        print(f"{label}: {value:.2f}")
numbackbonefeaturizer: 1.00 num_rings_sum_backbonefeaturizer: 0.00 num_atoms_sum_backbonefeaturizer: 5.00 topological_surface_area_sum_backbonefeaturizer: 9.23 numbackbonefeaturizer: 1.00 num_rings_sum_backbonefeaturizer: 0.00 num_atoms_sum_backbonefeaturizer: 6.00 topological_surface_area_sum_backbonefeaturizer: 0.00
Full Molecular Featurization
In [6]:
Copied!
# Plain SMILES strings of small molecules. These are kept under their own
# name instead of rebinding `psmiles_list`, so the polymer cells above (which
# read `psmiles_list`) still work when they are re-run after this cell.
smiles_list = [
    "CCCC",
    "NC(=O)c1ccc2c(c1)nc(C1CCC(O)CC1)n2CCCO",
    "CNC(=S)Nc1cccc(-c2cnc3ccccc3n2)c1",
    "C#Cc1ccc(-c2nc(-c3cc[nH]c(=O)c3)c(-c3ccc(F)cc3)[nH]2)cc1",
]

# Same descriptors as in the full-polymer example, but wrapped in
# FullMolecularFeaturizer because the inputs are Molecule objects.
molecular_featurizers = [
    FullMolecularFeaturizer(NumRings()),
    FullMolecularFeaturizer(MolecularWeight()),
    FullMolecularFeaturizer(TopologicalSurfaceArea()),
]
molecular_multi_featurizer = MultipleFeaturizer(molecular_featurizers)

for smiles in smiles_list:
    molecule = Molecule.from_smiles(smiles)
    features = molecular_multi_featurizer.featurize(molecule)
    # Labels are requested after featurize(), mirroring the polymer cells.
    labels = molecular_multi_featurizer.feature_labels()
    for label, value in zip(labels, features):
        print(f"{label}: {value:.2f}")
num_rings_sum_fullmolecularfeaturizer: 0.00 molecular_weight_sum_fullmolecularfeaturizer: 58.08 topological_surface_area_sum_fullmolecularfeaturizer: 0.00 num_rings_sum_fullmolecularfeaturizer: 3.00 molecular_weight_sum_fullmolecularfeaturizer: 317.17 topological_surface_area_sum_fullmolecularfeaturizer: 101.37 num_rings_sum_fullmolecularfeaturizer: 3.00 molecular_weight_sum_fullmolecularfeaturizer: 294.09 topological_surface_area_sum_fullmolecularfeaturizer: 49.84 num_rings_sum_fullmolecularfeaturizer: 4.00 molecular_weight_sum_fullmolecularfeaturizer: 355.11 topological_surface_area_sum_fullmolecularfeaturizer: 61.54
Using Comparators to Compare Polymer and Molecule Features
In [7]:
Copied!
# The polymer / molecule pair whose features are compared.
polymer = Polymer.from_psmiles("*CCCCCCNC(=O)c1ccc(C(=O)N*)c(Sc2ccccc2)c1")
molecule = Molecule.from_smiles("CC(=O)OC1=CC=CC=C1C(=O)O")

# Both sides are built from one descriptor list so the polymer and molecule
# featurizers always contain the same descriptors in the same order.
descriptor_types = (
    MolecularWeight,
    NumHBondDonors,
    NumHBondAcceptors,
    NumRotatableBonds,
)
polymer_featurizers = [
    FullPolymerFeaturizer(descriptor()) for descriptor in descriptor_types
]
molecule_featurizers = [
    FullMolecularFeaturizer(descriptor()) for descriptor in descriptor_types
]

polymer_multi = MultipleFeaturizer(polymer_featurizers)
molecule_multi = MultipleFeaturizer(molecule_featurizers)

comparator = PolymerMoleculeComparator(
    polymer_multi,
    molecule_multi,
    comparisons=[
        "absolute_difference",
        "signed_difference",
        "product",
        "squared_distance",
        "euclidean_distance",
    ],
    agg=["mean", "max", "min", "sum"],
)

# In the recorded output, compare() yields one value per (comparison, feature)
# pair followed by one value per (aggregation, feature) pair, and
# feature_labels() yields the matching labels in the same order.
comparison_values = comparator.compare(polymer, molecule)
labels = comparator.feature_labels()

# Print feature-wise results
for label, value in zip(labels, comparison_values):
    print(f" {label}: {value}")
molecular_weight_sum_fullpolymerfeaturizer_absolute_difference: 174.097940208 num_hbond_donors_sum_fullpolymerfeaturizer_absolute_difference: 1.0 num_hbond_acceptors_sum_fullpolymerfeaturizer_absolute_difference: 0.0 num_rotatable_bonds_sum_fullpolymerfeaturizer_absolute_difference: 8.0 molecular_weight_sum_fullpolymerfeaturizer_signed_difference: 174.097940208 num_hbond_donors_sum_fullpolymerfeaturizer_signed_difference: 1.0 num_hbond_acceptors_sum_fullpolymerfeaturizer_signed_difference: 0.0 num_rotatable_bonds_sum_fullpolymerfeaturizer_signed_difference: 8.0 molecular_weight_sum_fullpolymerfeaturizer_product: 63760.20132709417 num_hbond_donors_sum_fullpolymerfeaturizer_product: 2.0 num_hbond_acceptors_sum_fullpolymerfeaturizer_product: 9.0 num_rotatable_bonds_sum_fullpolymerfeaturizer_product: 20.0 molecular_weight_sum_fullpolymerfeaturizer_squared_distance: 30310.092784668348 num_hbond_donors_sum_fullpolymerfeaturizer_squared_distance: 1.0 num_hbond_acceptors_sum_fullpolymerfeaturizer_squared_distance: 0.0 num_rotatable_bonds_sum_fullpolymerfeaturizer_squared_distance: 64.0 molecular_weight_sum_fullpolymerfeaturizer_euclidean_distance: 174.097940208 num_hbond_donors_sum_fullpolymerfeaturizer_euclidean_distance: 1.0 num_hbond_acceptors_sum_fullpolymerfeaturizer_euclidean_distance: 0.0 num_rotatable_bonds_sum_fullpolymerfeaturizer_euclidean_distance: 8.0 molecular_weight_sum_fullpolymerfeaturizer_absolute_difference_signed_difference_product_squared_distance_euclidean_distance_mean: 18918.5175864773 num_hbond_donors_sum_fullpolymerfeaturizer_absolute_difference_signed_difference_product_squared_distance_euclidean_distance_mean: 1.2 num_hbond_acceptors_sum_fullpolymerfeaturizer_absolute_difference_signed_difference_product_squared_distance_euclidean_distance_mean: 1.8 num_rotatable_bonds_sum_fullpolymerfeaturizer_absolute_difference_signed_difference_product_squared_distance_euclidean_distance_mean: 21.6 
molecular_weight_sum_fullpolymerfeaturizer_absolute_difference_signed_difference_product_squared_distance_euclidean_distance_max: 63760.20132709417 num_hbond_donors_sum_fullpolymerfeaturizer_absolute_difference_signed_difference_product_squared_distance_euclidean_distance_max: 2.0 num_hbond_acceptors_sum_fullpolymerfeaturizer_absolute_difference_signed_difference_product_squared_distance_euclidean_distance_max: 9.0 num_rotatable_bonds_sum_fullpolymerfeaturizer_absolute_difference_signed_difference_product_squared_distance_euclidean_distance_max: 64.0 molecular_weight_sum_fullpolymerfeaturizer_absolute_difference_signed_difference_product_squared_distance_euclidean_distance_min: 174.097940208 num_hbond_donors_sum_fullpolymerfeaturizer_absolute_difference_signed_difference_product_squared_distance_euclidean_distance_min: 1.0 num_hbond_acceptors_sum_fullpolymerfeaturizer_absolute_difference_signed_difference_product_squared_distance_euclidean_distance_min: 0.0 num_rotatable_bonds_sum_fullpolymerfeaturizer_absolute_difference_signed_difference_product_squared_distance_euclidean_distance_min: 8.0 molecular_weight_sum_fullpolymerfeaturizer_absolute_difference_signed_difference_product_squared_distance_euclidean_distance_sum: 94592.58793238651 num_hbond_donors_sum_fullpolymerfeaturizer_absolute_difference_signed_difference_product_squared_distance_euclidean_distance_sum: 6.0 num_hbond_acceptors_sum_fullpolymerfeaturizer_absolute_difference_signed_difference_product_squared_distance_euclidean_distance_sum: 9.0 num_rotatable_bonds_sum_fullpolymerfeaturizer_absolute_difference_signed_difference_product_squared_distance_euclidean_distance_sum: 108.0
Adding Terminal Groups to Polymers
In [8]:
Copied!
# Polymer used by the backbone, side-chain and full-polymer cells that follow.
polymer = Polymer.from_psmiles("[*]=C(C#N)NC(=O)c1ccc(C(=O)NC(=[*])C#N)cc1")

# Add terminal groups for the sidechain and backbone.
# Presumably each mapping assigns a terminal-group fragment to the "[*]"
# placeholder of the corresponding part of the polymer -- TODO confirm
# against the Polymer documentation.
polymer.backbone_terminal_groups = {"[*]": "*O"}
polymer.sidechain_terminal_groups = {"[*]": "*CCO"}
In [9]:
Copied!
# Backbone features of the `polymer` that was given terminal groups in the
# previous cell.
backbone_featurizers = [
    NumBackBoneFeaturizer(),
    BackBoneFeaturizer(NumRings()),
    BackBoneFeaturizer(NumAtoms()),
    BackBoneFeaturizer(TopologicalSurfaceArea()),
]
backbone_multi_featurizer = MultipleFeaturizer(backbone_featurizers)

features = backbone_multi_featurizer.featurize(polymer)
# Requested after featurize(): the recorded labels carry a
# "with_terminalgroups" infix for this polymer.
labels = backbone_multi_featurizer.feature_labels()

# Print labels and features
for label, value in zip(labels, features):
    print(f"{label}: {value:.2f}")
numbackbonefeaturizer: 1.00 num_rings_sum_with_terminalgroups_backbonefeaturizer: 1.00 num_atoms_sum_with_terminalgroups_backbonefeaturizer: 16.00 topological_surface_area_sum_with_terminalgroups_backbonefeaturizer: 92.34
In [10]:
Copied!
# Side-chain features of the `polymer` that was given terminal groups above.
sidechain_featurizers = [
    NumSideChainFeaturizer(),
    SideChainFeaturizer(NumAtoms()),
    SideChainFeaturizer(NumHBondDonors()),
    SideChainFeaturizer(TopologicalSurfaceArea()),
]
sidechain_multi_featurizer = MultipleFeaturizer(sidechain_featurizers)

features = sidechain_multi_featurizer.featurize(polymer)
# Requested after featurize(): the recorded labels carry a
# "with_terminalgroups" infix for this polymer.
labels = sidechain_multi_featurizer.feature_labels()

# Print labels and features
for label, value in zip(labels, features):
    print(f"{label}: {value:.2f}")
numsidechainfeaturizer: 2.00 num_atoms_with_terminalgroups_sidechainfeaturizer_sum: 10.00 num_hbond_donors_with_terminalgroups_sidechainfeaturizer_sum: 2.00 topological_surface_area_with_terminalgroups_sidechainfeaturizer_sum: 88.04
In [11]:
Copied!
# Full polymer featurizers, applied to the `polymer` that was given terminal
# groups above.
full_polymer_featurizers = [
    FullPolymerFeaturizer(NumAtoms()),
    FullPolymerFeaturizer(NumHBondDonors()),
    FullPolymerFeaturizer(TopologicalSurfaceArea()),
]
full_multi_featurizer = MultipleFeaturizer(full_polymer_featurizers)

features = full_multi_featurizer.featurize(polymer)
# Requested after featurize(): the recorded labels carry a
# "with_terminalgroups" infix for this polymer.
labels = full_multi_featurizer.feature_labels()

# NOTE(review): the header says "molecular" although these are full-polymer
# features. The string is left unchanged so it matches the recorded output;
# consider renaming it when the notebook is next re-executed.
print("\nFull molecular features:")
for label, value in zip(labels, features):
    print(f"{label}: {value:.2f}")
Full molecular features: num_atoms_sum_with_terminalgroups_fullpolymerfeaturizer: 20.00 num_hbond_donors_sum_with_terminalgroups_fullpolymerfeaturizer: 2.00 topological_surface_area_sum_with_terminalgroups_fullpolymerfeaturizer: 105.78