import os
import glob
import multiprocessing
import re
from io import BytesIO
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from tqdm import tqdm
from pebble import ProcessPool
from concurrent.futures import TimeoutError
# ==========================================
# WORKER FUNCTION (Executes in parallel)
# ==========================================
def process_single_block(mol_block_str):
# RDKit's ForwardSDMolSupplier automatically parses SDF data fields (lines starting with >)
# into molecule properties that can be accessed via GetProp().
supplier = Chem.ForwardSDMolSupplier(BytesIO(mol_block_str.encode('utf-8')))
try:
mol = next(supplier)
except StopIteration:
mol = None
if mol is None:
return ("read_error", None, None)
# 1. Criterion: Single molecule (no disconnected fragments/salts)
if len(Chem.GetMolFrags(mol)) > 1:
return ("multi_frag", None, None)
# 2. Criterion: Molecular weight limits (e.g., between 100 and 700 Da)
mw = Descriptors.MolWt(mol)
if mw > 700.0:
return ("high_weight", None, None)
if mw < 100.0:
return ("low_weight", None, None)
# 3. Criterion: No chiral carbons (Nonchiral sublibrary)
chiral_centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
for idx, _ in chiral_centers:
if mol.GetAtomWithIdx(idx).GetAtomicNum() == 6:
return ("chiral_carbons", None, None)
# 4. Criterion: Price less than 50 USD per mg
# RDKit automatically parses SDF data blocks into Properties
if mol.HasProp("PRICERANGE_1MG"):
price_str = mol.GetProp("PRICERANGE_1MG")
# Use regex to extract the numeric value (e.g., from "45 USD" or "45.0")
match = re.search(r'\d+(\.\d+)?', price_str)
if match:
price = float(match.group())
if price >= 50.0:
return ("high_price", None, None)
else:
return ("invalid_price", None, None)
else:
return ("no_price", None, None) # Reject if no price data is available
# 5. Extract MolPort ID
if mol.HasProp("PUBCHEM_EXT_DATASOURCE_REGID"):
mol_name = mol.GetProp("PUBCHEM_EXT_DATASOURCE_REGID")
else:
mol_name = "Unknown_MolPort_ID"
# 6. Generate 3D coordinates and add Hydrogens
mol_with_h = Chem.AddHs(mol)
res = AllChem.EmbedMolecule(mol_with_h, randomSeed=42, maxAttempts=1000)
if res == -1:
return ("3d_error", None, None)
# IMPORTANT: Return 3 items (Status, Binary molecule, Extracted name)
return ("ok", mol_with_h.ToBinary(), mol_name)
# ==========================================
# TEXT READER (Yields SDF blocks)
# ==========================================
def sdf_block_generator(filepath):
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
block = []
for line in f:
block.append(line)
if line.strip() == "$$$$":
yield "".join(block)
block = []
# ==========================================
# MAIN FUNCTION
# ==========================================
def process_molport_database():
input_dir = "/home/jant.encinar/MolPort-2026-03-All-Stock-Compounds"
output_dir = "/home/jant.encinar/MolPort-2026-03-All-Stock-Compounds-nonchiral-carbon"
os.makedirs(output_dir, exist_ok=True)
sdf_files = glob.glob(os.path.join(input_dir, "*.sdf"))
if not sdf_files:
print(f"No .sdf files found in {input_dir}")
return
mols_per_file = 500
file_counter = 1
current_mol_count = 0
stats = {
"read": 0, "saved": 0, "discarded_multi_frag": 0,
"discarded_high_weight": 0, "discarded_low_weight": 0,
"discarded_chiral_carbons": 0, "discarded_high_price": 0,
"discarded_no_price": 0, "discarded_invalid_price": 0,
"discarded_3d_error": 0, "discarded_timeout": 0, "read_errors": 0
}
current_out_filepath = os.path.join(output_dir, f"MPv8-{file_counter:04d}.sdf")
writer = Chem.SDWriter(current_out_filepath)
cpu_cores = multiprocessing.cpu_count()
print(f"Starting. CPU cores: {cpu_cores}. Timeout: 30s. MW limits: 100-700 Da. Price < 50 USD.")
with ProcessPool(max_workers=cpu_cores) as pool:
for filepath in sdf_files:
filename = os.path.basename(filepath)
blocks_iterator = sdf_block_generator(filepath)
future = pool.map(process_single_block, blocks_iterator, timeout=30, chunksize=10)
iterator = future.result()
with tqdm(desc=f"Processing {filename}") as pbar:
while True:
try:
# Unpack the 3 variables
status, mol_bin, mol_name = next(iterator)
stats["read"] += 1
pbar.update(1)
if status == "ok":
mol = Chem.Mol(mol_bin)
# RE-ASSIGN THE NAME right before saving
mol.SetProp("_Name", mol_name)
writer.write(mol)
current_mol_count += 1
stats["saved"] += 1
if current_mol_count == mols_per_file:
writer.close()
file_counter += 1
current_mol_count = 0
current_out_filepath = os.path.join(output_dir, f"MPv8-{file_counter:04d}.sdf")
writer = Chem.SDWriter(current_out_filepath)
else:
if status == "read_error":
stats["read_errors"] += 1
else:
stats[f"discarded_{status}"] += 1
except StopIteration:
break
except TimeoutError:
stats["read"] += 1
stats["discarded_timeout"] += 1
pbar.update(1)
except Exception as error:
stats["read"] += 1
stats["read_errors"] += 1
pbar.update(1)
if writer is not None:
writer.close()
if current_mol_count == 0 and os.path.exists(current_out_filepath):
os.remove(current_out_filepath)
print("\n" + "="*50)
print("PROCESSING SUMMARY")
print("="*50)
print(f"Total molecules read: {stats['read']}")
print(f"Molecules successfully saved: {stats['saved']}")
print("-" * 50)
print("Discarded:")
print(f" - Multiple fragments: {stats['discarded_multi_frag']}")
print(f" - MW > 700 Da: {stats['discarded_high_weight']}")
print(f" - MW < 100 Da: {stats['discarded_low_weight']}")
print(f" - Contains chiral carbons: {stats['discarded_chiral_carbons']}")
print(f" - Price >= 50 USD: {stats['discarded_high_price']}")
print(f" - Missing or invalid price: {stats['discarded_no_price'] + stats['discarded_invalid_price']}")
print(f" - 3D generation failure: {stats['discarded_3d_error']}")
print(f" - Timeout exceeded (30s): {stats['discarded_timeout']}")
print(f" - RDKit read errors: {stats['read_errors']}")
print("="*50)
if __name__ == "__main__":
process_molport_database()