Python script to build "MolPort nonchiral carbon" library

#!/bin/bash
#SBATCH --job-name=MP8-noQ
#SBATCH --qos=normal
#SBATCH --cpus-per-task=32
#SBATCH --mem=64G
#SBATCH --time=7-0
#SBATCH --output=/home/jant.encinar/MolPort-db-noQuiral/MP8-noQ_salida-%j.out
#SBATCH --error=/home/jant.encinar/MolPort-db-noQuiral/MP8-noQ_error-%j.err

# Load the necessary Python/Conda environment
source /home/jant.encinar/anaconda3/etc/profile.d/conda.sh

# Name of my environment with RDkit installed
conda activate MolPort-db

# Run the script
python3 /home/jant.encinar/MolPort-db-noQuiral/script-number-130.py
"script-number-130": Python script for selection of the "MolPort nonchiral carbon" library.
import os
import glob
import multiprocessing
import re
from io import BytesIO
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from tqdm import tqdm
from pebble import ProcessPool
from concurrent.futures import TimeoutError

# ==========================================
# WORKER FUNCTION (Executes in parallel)
# ==========================================
def process_single_block(mol_block_str):
    # RDKit's ForwardSDMolSupplier automatically parses SDF data fields (lines starting with >)
    # into molecule properties that can be accessed via GetProp().
    supplier = Chem.ForwardSDMolSupplier(BytesIO(mol_block_str.encode('utf-8')))
    try:
        mol = next(supplier)
    except StopIteration:
        mol = None

    if mol is None:
        return ("read_error", None, None)
    
    # 1. Criterion: Single molecule (no disconnected fragments/salts)
    if len(Chem.GetMolFrags(mol)) > 1:
        return ("multi_frag", None, None)
        
    # 2. Criterion: Molecular weight limits (e.g., between 100 and 700 Da)
    mw = Descriptors.MolWt(mol)
    if mw > 700.0:
        return ("high_weight", None, None)
    if mw < 100.0:
        return ("low_weight", None, None)
        
    # 3. Criterion: No chiral carbons (Nonchiral sublibrary)
    chiral_centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
    for idx, _ in chiral_centers:
        if mol.GetAtomWithIdx(idx).GetAtomicNum() == 6:
            return ("chiral_carbons", None, None) 
            
    # 4. Criterion: Price less than 50 USD per mg
    # RDKit automatically parses SDF data blocks into Properties
    if mol.HasProp("PRICERANGE_1MG"):
        price_str = mol.GetProp("PRICERANGE_1MG")
        # Use regex to extract the numeric value (e.g., from "45 USD" or "45.0")
        match = re.search(r'\d+(\.\d+)?', price_str)
        if match:
            price = float(match.group())
            if price >= 50.0:
                return ("high_price", None, None)
        else:
            return ("invalid_price", None, None) 
    else:
        return ("no_price", None, None) # Reject if no price data is available

    # 5. Extract MolPort ID
    if mol.HasProp("PUBCHEM_EXT_DATASOURCE_REGID"):
        mol_name = mol.GetProp("PUBCHEM_EXT_DATASOURCE_REGID")
    else:
        mol_name = "Unknown_MolPort_ID"

    # 6. Generate 3D coordinates and add Hydrogens
    mol_with_h = Chem.AddHs(mol)
    res = AllChem.EmbedMolecule(mol_with_h, randomSeed=42, maxAttempts=1000)
    
    if res == -1:
        return ("3d_error", None, None) 

    # IMPORTANT: Return 3 items (Status, Binary molecule, Extracted name)
    return ("ok", mol_with_h.ToBinary(), mol_name)

# ==========================================
# TEXT READER (Yields SDF blocks)
# ==========================================
def sdf_block_generator(filepath):
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        block = []
        for line in f:
            block.append(line)
            if line.strip() == "$$$$":
                yield "".join(block)
                block = []

# ==========================================
# MAIN FUNCTION
# ==========================================
def process_molport_database():
    input_dir = "/home/jant.encinar/MolPort-2026-03-All-Stock-Compounds"
    output_dir = "/home/jant.encinar/MolPort-2026-03-All-Stock-Compounds-nonchiral-carbon"
    
    os.makedirs(output_dir, exist_ok=True)
    sdf_files = glob.glob(os.path.join(input_dir, "*.sdf"))
    
    if not sdf_files:
        print(f"No .sdf files found in {input_dir}")
        return

    mols_per_file = 500
    file_counter = 1
    current_mol_count = 0
    
    stats = {
        "read": 0, "saved": 0, "discarded_multi_frag": 0,
        "discarded_high_weight": 0, "discarded_low_weight": 0, 
        "discarded_chiral_carbons": 0, "discarded_high_price": 0,
        "discarded_no_price": 0, "discarded_invalid_price": 0,
        "discarded_3d_error": 0, "discarded_timeout": 0, "read_errors": 0
    }
    
    current_out_filepath = os.path.join(output_dir, f"MPv8-{file_counter:04d}.sdf")
    writer = Chem.SDWriter(current_out_filepath)
    cpu_cores = multiprocessing.cpu_count()
    
    print(f"Starting. CPU cores: {cpu_cores}. Timeout: 30s. MW limits: 100-700 Da. Price < 50 USD.")
    
    with ProcessPool(max_workers=cpu_cores) as pool:
        for filepath in sdf_files:
            filename = os.path.basename(filepath)
            blocks_iterator = sdf_block_generator(filepath)
            
            future = pool.map(process_single_block, blocks_iterator, timeout=30, chunksize=10)
            iterator = future.result()
            
            with tqdm(desc=f"Processing {filename}") as pbar:
                while True:
                    try:
                        # Unpack the 3 variables
                        status, mol_bin, mol_name = next(iterator)
                        stats["read"] += 1
                        pbar.update(1)
                        
                        if status == "ok":
                            mol = Chem.Mol(mol_bin)
                            # RE-ASSIGN THE NAME right before saving
                            mol.SetProp("_Name", mol_name)
                            
                            writer.write(mol)
                            current_mol_count += 1
                            stats["saved"] += 1
                            
                            if current_mol_count == mols_per_file:
                                writer.close()
                                file_counter += 1
                                current_mol_count = 0
                                current_out_filepath = os.path.join(output_dir, f"MPv8-{file_counter:04d}.sdf")
                                writer = Chem.SDWriter(current_out_filepath)
                        else:
                            if status == "read_error":
                                stats["read_errors"] += 1
                            else:
                                stats[f"discarded_{status}"] += 1
                                
                    except StopIteration:
                        break 
                    except TimeoutError:
                        stats["read"] += 1
                        stats["discarded_timeout"] += 1
                        pbar.update(1)
                    except Exception as error:
                        stats["read"] += 1
                        stats["read_errors"] += 1
                        pbar.update(1)

    if writer is not None:
        writer.close()
        if current_mol_count == 0 and os.path.exists(current_out_filepath):
             os.remove(current_out_filepath)
             
    print("\n" + "="*50)
    print("PROCESSING SUMMARY")
    print("="*50)
    print(f"Total molecules read:            {stats['read']}")
    print(f"Molecules successfully saved:    {stats['saved']}")
    print("-" * 50)
    print("Discarded:")
    print(f"  - Multiple fragments:          {stats['discarded_multi_frag']}")
    print(f"  - MW > 700 Da:                 {stats['discarded_high_weight']}")
    print(f"  - MW < 100 Da:                 {stats['discarded_low_weight']}")
    print(f"  - Contains chiral carbons:     {stats['discarded_chiral_carbons']}")
    print(f"  - Price >= 50 USD:             {stats['discarded_high_price']}")
    print(f"  - Missing or invalid price:    {stats['discarded_no_price'] + stats['discarded_invalid_price']}")
    print(f"  - 3D generation failure:       {stats['discarded_3d_error']}")
    print(f"  - Timeout exceeded (30s):      {stats['discarded_timeout']}")
    print(f"  - RDKit read errors:           {stats['read_errors']}")
    print("="*50)

if __name__ == "__main__":
    process_molport_database()