Python script to build "MolPort chiral carbon" library

#!/bin/bash
#SBATCH --job-name=MP8-Q
#SBATCH --qos=normal
#SBATCH --cpus-per-task=32
#SBATCH --mem=64G
#SBATCH --time=7-0
#SBATCH --output=/home/jant.encinar/MolPort-db-Quiral/MP8-Q_salida-%j.out
#SBATCH --error=/home/jant.encinar/MolPort-db-Quiral/MP8-Q_error-%j.err

# Load the necessary Python/Conda environment
source /home/jant.encinar/anaconda3/etc/profile.d/conda.sh

# Name of my environment with RDkit installed
conda activate MolPort-db

# Run the script
python3 /home/jant.encinar/MolPort-db-Quiral/script-number-131.py
"script-number-131": Python script for selection of the "MolPort chiral carbon" library.
import os
import glob
import multiprocessing
import re
from io import BytesIO
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions
from tqdm import tqdm
from pebble import ProcessPool
from concurrent.futures import TimeoutError

# ==========================================
# WORKER FUNCTION (Executes in parallel)
# ==========================================
def process_single_block(mol_block_str):
    supplier = Chem.ForwardSDMolSupplier(BytesIO(mol_block_str.encode('utf-8')))
    try:
        mol = next(supplier)
    except StopIteration:
        mol = None

    if mol is None:
        return ("read_error", None, None)
    
    # 1. Criterion: Single molecule (no disconnected fragments)
    if len(Chem.GetMolFrags(mol)) > 1:
        return ("multi_frag", None, None)
        
    # 2. Criterion: Molecular weight limits (100 to 700 Da)
    mw = Descriptors.MolWt(mol)
    if mw > 700.0:
        return ("high_weight", None, None)
    if mw < 100.0:
        return ("low_weight", None, None)
        
    # 3. Criterion: 1 or 2 Chiral Carbons (Chiral sublibrary)
    chiral_centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
    chiral_carbons = [idx for idx, _ in chiral_centers if mol.GetAtomWithIdx(idx).GetAtomicNum() == 6]
    chiral_carbon_count = len(chiral_carbons)

    if chiral_carbon_count == 0:
        return ("zero_chiral_carbons", None, None) # They belong to script 12
    if chiral_carbon_count > 2:
        return ("too_many_chiral", None, None) # Max 2 chiral carbons
            
    # 4. Criterion: Price less than 50 USD per mg
    if mol.HasProp("PRICERANGE_1MG"):
        price_str = mol.GetProp("PRICERANGE_1MG")
        match = re.search(r'\d+(\.\d+)?', price_str)
        if match:
            price = float(match.group())
            if price >= 50.0:
                return ("high_price", None, None)
        else:
            return ("invalid_price", None, None) 
    else:
        return ("no_price", None, None)

    # 5. Extract Base MolPort ID
    if mol.HasProp("PUBCHEM_EXT_DATASOURCE_REGID"):
        base_id = mol.GetProp("PUBCHEM_EXT_DATASOURCE_REGID")
    else:
        base_id = "Unknown_MolPort_ID"

    # 6. --- 3D GENERATION AND STEREOISOMERS ---
    stereo_opts = StereoEnumerationOptions(tryEmbedding=True, maxIsomers=4) # max 4 isomers for 2 centers
    isomers = list(EnumerateStereoisomers(mol, options=stereo_opts))
    
    valid_isomers_data = []
    
    for idx, isomer in enumerate(isomers):
        isomer_name = f"{base_id}_{idx+1}"
        isomer_h = Chem.AddHs(isomer)
        
        # 3D Embedding
        res = AllChem.EmbedMolecule(isomer_h, randomSeed=42)
        if res != 0:
            res = AllChem.EmbedMolecule(isomer_h, useRandomCoords=True, randomSeed=42)
            if res != 0: 
                continue # Skip this isomer if 3D generation totally fails
                
        # Force Field Optimization
        try:
            AllChem.MMFFOptimizeMolecule(isomer_h, maxIters=200)
        except:
            pass
            
        # Calculate R/S labels
        try:
            Chem.AssignStereochemistry(isomer_h, force=True, cleanIt=True)
            centros_rs = Chem.FindMolChiralCenters(isomer_h)
            if centros_rs:
                texto_quiral = " | ".join([f"C{i}: {rs}" for i, rs in centros_rs])
                isomer_h.SetProp("CHIRALITY", texto_quiral)
            else:
                isomer_h.SetProp("CHIRALITY", "No defined centers")
        except:
            pass
            
        isomer_h.SetProp("_Name", isomer_name)
        
        # Pack the binary data for the main thread
        valid_isomers_data.append(isomer_h.ToBinary())

    if not valid_isomers_data:
        return ("3d_error", None, None) # No valid 3D isomers were generated

    return ("ok", valid_isomers_data, base_id)

# ==========================================
# TEXT READER (Yields SDF blocks)
# ==========================================
def sdf_block_generator(filepath):
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        block = []
        for line in f:
            block.append(line)
            if line.strip() == "$$$$":
                yield "".join(block)
                block = []

# ==========================================
# MAIN FUNCTION
# ==========================================
def process_molport_database():
    # DIRECTORIES FOR LINUX
    input_dir = "./MolPort-database-v8"
    output_dir = "./MolPort-database-v8_SDF500-CHIRAL"
    
    os.makedirs(output_dir, exist_ok=True)
    sdf_files = glob.glob(os.path.join(input_dir, "*.sdf"))
    
    if not sdf_files:
        print(f"No .sdf files found in {input_dir}")
        return

    mols_per_file = 500
    file_counter = 1
    current_mol_count = 0
    
    stats = {
        "read": 0, "isomers_saved": 0, "discarded_multi_frag": 0,
        "discarded_high_weight": 0, "discarded_low_weight": 0, 
        "discarded_zero_chiral_carbons": 0, "discarded_too_many_chiral": 0, 
        "discarded_high_price": 0, "discarded_no_price": 0, 
        "discarded_invalid_price": 0, "discarded_3d_error": 0, 
        "discarded_kekulize": 0, "discarded_timeout": 0, "read_errors": 0
    }
    
    current_out_filepath = os.path.join(output_dir, f"MPv8_Chiral-{file_counter:04d}.sdf")
    writer = Chem.SDWriter(current_out_filepath)
    cpu_cores = multiprocessing.cpu_count()
    
    print(f"Starting Chiral Processing. Cores: {cpu_cores}. MW limits: 100-700. Price < 50. Max Chiral: 2.")
    
    with ProcessPool(max_workers=cpu_cores) as pool:
        for filepath in sdf_files:
            filename = os.path.basename(filepath)
            blocks_iterator = sdf_block_generator(filepath)
            
            future = pool.map(process_single_block, blocks_iterator, timeout=40, chunksize=10)
            iterator = future.result()
            
            with tqdm(desc=f"Processing {filename}") as pbar:
                while True:
                    try:
                        status, isomers_bin_list, _ = next(iterator)
                        stats["read"] += 1
                        pbar.update(1)
                        
                        if status == "ok":
                            # Process the list of valid stereoisomers for this molecule
                            for mol_bin in isomers_bin_list:
                                mol = Chem.Mol(mol_bin)
                                
                                # --- THE SOLUTION TO THE KEKULIZE ERROR ---
                                try:
                                    writer.write(mol)
                                    current_mol_count += 1
                                    stats["isomers_saved"] += 1
                                except Exception:
                                    # Silently ignore Kekulize or sanitization writing errors
                                    stats["discarded_kekulize"] += 1
                                    continue
                                # ----------------------------------------
                                
                                if current_mol_count == mols_per_file:
                                    writer.close()
                                    file_counter += 1
                                    current_mol_count = 0
                                    current_out_filepath = os.path.join(output_dir, f"MPv8_Chiral-{file_counter:04d}.sdf")
                                    writer = Chem.SDWriter(current_out_filepath)
                        else:
                            if status == "read_error":
                                stats["read_errors"] += 1
                            else:
                                stats[f"discarded_{status}"] += 1
                                
                    except StopIteration:
                        break 
                    except TimeoutError:
                        stats["read"] += 1
                        stats["discarded_timeout"] += 1
                        pbar.update(1)
                    except Exception as error:
                        stats["read"] += 1
                        stats["read_errors"] += 1
                        pbar.update(1)

    if writer is not None:
        writer.close()
        if current_mol_count == 0 and os.path.exists(current_out_filepath):
             os.remove(current_out_filepath)
             
    print("\n" + "="*50)
    print("CHIRAL PROCESSING SUMMARY")
    print("="*50)
    print(f"Total base molecules read:       {stats['read']}")
    print(f"Total Stereoisomers SAVED:       {stats['isomers_saved']}")
    print("-" * 50)
    print("Base Molecules Discarded:")
    print(f"  - Zero Chiral Carbons:         {stats['discarded_zero_chiral_carbons']}")
    print(f"  - >2 Chiral Carbons:           {stats['discarded_too_many_chiral']}")
    print(f"  - Multiple fragments:          {stats['discarded_multi_frag']}")
    print(f"  - MW > 700 Da:                 {stats['discarded_high_weight']}")
    print(f"  - MW < 100 Da:                 {stats['discarded_low_weight']}")
    print(f"  - Price >= 50 USD:             {stats['discarded_high_price']}")
    print(f"  - Missing/Invalid price:       {stats['discarded_no_price'] + stats['discarded_invalid_price']}")
    print(f"  - 3D Generation totally failed:{stats['discarded_3d_error']}")
    print(f"  - Timeout exceeded (40s):      {stats['discarded_timeout']}")
    print(f"  - RDKit read errors:           {stats['read_errors']}")
    print("-" * 50)
    print(f"Individual Isomers Discarded:")
    print(f"  - By Kekulization/Write error: {stats['discarded_kekulize']}")
    print("="*50)

if __name__ == "__main__":
    process_molport_database()