import os
import glob
import multiprocessing
import re
from io import BytesIO
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions
from tqdm import tqdm
from pebble import ProcessPool
from concurrent.futures import TimeoutError
# ==========================================
# WORKER FUNCTION (Executes in parallel)
# ==========================================
def process_single_block(mol_block_str):
supplier = Chem.ForwardSDMolSupplier(BytesIO(mol_block_str.encode('utf-8')))
try:
mol = next(supplier)
except StopIteration:
mol = None
if mol is None:
return ("read_error", None, None)
# 1. Criterion: Single molecule (no disconnected fragments)
if len(Chem.GetMolFrags(mol)) > 1:
return ("multi_frag", None, None)
# 2. Criterion: Molecular weight limits (100 to 700 Da)
mw = Descriptors.MolWt(mol)
if mw > 700.0:
return ("high_weight", None, None)
if mw < 100.0:
return ("low_weight", None, None)
# 3. Criterion: 1 or 2 Chiral Carbons (Chiral sublibrary)
chiral_centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
chiral_carbons = [idx for idx, _ in chiral_centers if mol.GetAtomWithIdx(idx).GetAtomicNum() == 6]
chiral_carbon_count = len(chiral_carbons)
if chiral_carbon_count == 0:
return ("zero_chiral_carbons", None, None) # They belong to script 12
if chiral_carbon_count > 2:
return ("too_many_chiral", None, None) # Max 2 chiral carbons
# 4. Criterion: Price less than 50 USD per mg
if mol.HasProp("PRICERANGE_1MG"):
price_str = mol.GetProp("PRICERANGE_1MG")
match = re.search(r'\d+(\.\d+)?', price_str)
if match:
price = float(match.group())
if price >= 50.0:
return ("high_price", None, None)
else:
return ("invalid_price", None, None)
else:
return ("no_price", None, None)
# 5. Extract Base MolPort ID
if mol.HasProp("PUBCHEM_EXT_DATASOURCE_REGID"):
base_id = mol.GetProp("PUBCHEM_EXT_DATASOURCE_REGID")
else:
base_id = "Unknown_MolPort_ID"
# 6. --- 3D GENERATION AND STEREOISOMERS ---
stereo_opts = StereoEnumerationOptions(tryEmbedding=True, maxIsomers=4) # max 4 isomers for 2 centers
isomers = list(EnumerateStereoisomers(mol, options=stereo_opts))
valid_isomers_data = []
for idx, isomer in enumerate(isomers):
isomer_name = f"{base_id}_{idx+1}"
isomer_h = Chem.AddHs(isomer)
# 3D Embedding
res = AllChem.EmbedMolecule(isomer_h, randomSeed=42)
if res != 0:
res = AllChem.EmbedMolecule(isomer_h, useRandomCoords=True, randomSeed=42)
if res != 0:
continue # Skip this isomer if 3D generation totally fails
# Force Field Optimization
try:
AllChem.MMFFOptimizeMolecule(isomer_h, maxIters=200)
except:
pass
# Calculate R/S labels
try:
Chem.AssignStereochemistry(isomer_h, force=True, cleanIt=True)
centros_rs = Chem.FindMolChiralCenters(isomer_h)
if centros_rs:
texto_quiral = " | ".join([f"C{i}: {rs}" for i, rs in centros_rs])
isomer_h.SetProp("CHIRALITY", texto_quiral)
else:
isomer_h.SetProp("CHIRALITY", "No defined centers")
except:
pass
isomer_h.SetProp("_Name", isomer_name)
# Pack the binary data for the main thread
valid_isomers_data.append(isomer_h.ToBinary())
if not valid_isomers_data:
return ("3d_error", None, None) # No valid 3D isomers were generated
return ("ok", valid_isomers_data, base_id)
# ==========================================
# TEXT READER (Yields SDF blocks)
# ==========================================
def sdf_block_generator(filepath):
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
block = []
for line in f:
block.append(line)
if line.strip() == "$$$$":
yield "".join(block)
block = []
# ==========================================
# MAIN FUNCTION
# ==========================================
def process_molport_database():
# DIRECTORIES FOR LINUX
input_dir = "./MolPort-database-v8"
output_dir = "./MolPort-database-v8_SDF500-CHIRAL"
os.makedirs(output_dir, exist_ok=True)
sdf_files = glob.glob(os.path.join(input_dir, "*.sdf"))
if not sdf_files:
print(f"No .sdf files found in {input_dir}")
return
mols_per_file = 500
file_counter = 1
current_mol_count = 0
stats = {
"read": 0, "isomers_saved": 0, "discarded_multi_frag": 0,
"discarded_high_weight": 0, "discarded_low_weight": 0,
"discarded_zero_chiral_carbons": 0, "discarded_too_many_chiral": 0,
"discarded_high_price": 0, "discarded_no_price": 0,
"discarded_invalid_price": 0, "discarded_3d_error": 0,
"discarded_kekulize": 0, "discarded_timeout": 0, "read_errors": 0
}
current_out_filepath = os.path.join(output_dir, f"MPv8_Chiral-{file_counter:04d}.sdf")
writer = Chem.SDWriter(current_out_filepath)
cpu_cores = multiprocessing.cpu_count()
print(f"Starting Chiral Processing. Cores: {cpu_cores}. MW limits: 100-700. Price < 50. Max Chiral: 2.")
with ProcessPool(max_workers=cpu_cores) as pool:
for filepath in sdf_files:
filename = os.path.basename(filepath)
blocks_iterator = sdf_block_generator(filepath)
future = pool.map(process_single_block, blocks_iterator, timeout=40, chunksize=10)
iterator = future.result()
with tqdm(desc=f"Processing {filename}") as pbar:
while True:
try:
status, isomers_bin_list, _ = next(iterator)
stats["read"] += 1
pbar.update(1)
if status == "ok":
# Process the list of valid stereoisomers for this molecule
for mol_bin in isomers_bin_list:
mol = Chem.Mol(mol_bin)
# --- THE SOLUTION TO THE KEKULIZE ERROR ---
try:
writer.write(mol)
current_mol_count += 1
stats["isomers_saved"] += 1
except Exception:
# Silently ignore Kekulize or sanitization writing errors
stats["discarded_kekulize"] += 1
continue
# ----------------------------------------
if current_mol_count == mols_per_file:
writer.close()
file_counter += 1
current_mol_count = 0
current_out_filepath = os.path.join(output_dir, f"MPv8_Chiral-{file_counter:04d}.sdf")
writer = Chem.SDWriter(current_out_filepath)
else:
if status == "read_error":
stats["read_errors"] += 1
else:
stats[f"discarded_{status}"] += 1
except StopIteration:
break
except TimeoutError:
stats["read"] += 1
stats["discarded_timeout"] += 1
pbar.update(1)
except Exception as error:
stats["read"] += 1
stats["read_errors"] += 1
pbar.update(1)
if writer is not None:
writer.close()
if current_mol_count == 0 and os.path.exists(current_out_filepath):
os.remove(current_out_filepath)
print("\n" + "="*50)
print("CHIRAL PROCESSING SUMMARY")
print("="*50)
print(f"Total base molecules read: {stats['read']}")
print(f"Total Stereoisomers SAVED: {stats['isomers_saved']}")
print("-" * 50)
print("Base Molecules Discarded:")
print(f" - Zero Chiral Carbons: {stats['discarded_zero_chiral_carbons']}")
print(f" - >2 Chiral Carbons: {stats['discarded_too_many_chiral']}")
print(f" - Multiple fragments: {stats['discarded_multi_frag']}")
print(f" - MW > 700 Da: {stats['discarded_high_weight']}")
print(f" - MW < 100 Da: {stats['discarded_low_weight']}")
print(f" - Price >= 50 USD: {stats['discarded_high_price']}")
print(f" - Missing/Invalid price: {stats['discarded_no_price'] + stats['discarded_invalid_price']}")
print(f" - 3D Generation totally failed:{stats['discarded_3d_error']}")
print(f" - Timeout exceeded (40s): {stats['discarded_timeout']}")
print(f" - RDKit read errors: {stats['read_errors']}")
print("-" * 50)
print(f"Individual Isomers Discarded:")
print(f" - By Kekulization/Write error: {stats['discarded_kekulize']}")
print("="*50)
if __name__ == "__main__":
process_molport_database()