Python script to generate SQLite database.

To run this script, you need to download the following comma-separated text files compressed with predictions of all parameters calculated by admetSAR 3.0 for the 2.2 million compounds in the MolPort library:

MPv8-noQ-01_admetSAR3.zip Size: 481 MB
MPv8-noQ-02_admetSAR3.zip Size: 483 MB
MPv8-noQ-03_admetSAR3.zip Size: 275 MB
MPv8-Quiral-01_admetSAR3.zip Size: 472 MB
MPv8-Quiral-02_admetSAR3.zip Size: 449 MB

"script-number-133": Python script for generating a SQLite database of MolPort compounds annotated with admetSAR 3.0 predictions.
import os
import glob
import csv
import sqlite3
import time

# Path configuration according to your environment
DIRECTORY = r"E:\admetSAR-dataset-csv-files\"  # Change this to your actual path
DB_PATH = os.path.join(DIRECTORY, "base-MolPort.db")
CSV_PATTERN = os.path.join(DIRECTORY, "*.csv")

# Exact list of the 120 provided columns
COLUMNS_STR = (
    "MolPort_ID,SMILES,MW,nAtom,nHet,nRing,nRot,HBA,HBD,TPSA,SlogP,logS,QED,"
    "Lipinski_rule,Pfizer_rule,GSK_rule,logP,pKa,Acidic_pKa,Basic_pKa,Caco_2_c,Caco_2,"
    "HIA,MDCK,F50,F30,F20,BBB,OATP1B1_inhibitor,OATP1B3_inhibitor,OATP2B1_inhibitor,"
    "OCT1_inhibitor,OCT2_inhibitor,BCRP_inhibitor,BSEP_inhibitor,MATE1_inhibitor,"
    "Pgp_inhibitor,Pgp_substrate,PPB,VDss,CYP1A2_inhibitor,CYP3A4_inhibitor,CYP2B6_inhibitor,"
    "CYP2C9_inhibitor,CYP2C19_inhibitor,CYP2D6_inhibitor,CYP1A2_substrate,CYP3A4_substrate,"
    "CYP2B6_substrate,CYP2C9_substrate,CYP2C19_substrate,CYP2D6_substrate,HLM,RLM,"
    "UGT_substrate,CLp_c,CLr,T50,MRT,Neurotoxicity,DILI,hERG_1uM,hERG_10uM,hERG_30uM,"
    "hERG_1_10uM,hERG_10_30uM,Respiratory_toxicity,Nephrotoxicity,Eye_corrosion,Eye_irritation,"
    "Skin_corrosion,Skin_irritation,Skin_sensitisation,ADT,Ames,Mouse_carcinogenicity_c,"
    "Mouse_carcinogenicity,Rat_carcinogenicity_c,Rat_carcinogenicity,Rodents_carcinogenicity,"
    "Micronucleus,Reproductive_toxicity,Mitochondrial_toxicity,Hemolytic_toxicity,"
    "Repeated_dose_toxicity,AOT_c,AOT,FDAMDD_c,FDAMDD,AR,ER,AR_LBD,ER_LBD,Aromatase,AhR,"
    "ARE,ATAD5,HSE,p53,PPAR,MMP,TR,GR,subcapitata_toxicity,Crustaceans_toxicity,magna_toxicity,"
    "Fish_toxicity,Fathead_minnow_toxicity,Bluegill_sunfish_toxicity,Rainbow_trout_toxicity,"
    "Sheepshead_minnow_toxicity,pyriformis_toxicity_c,pyriformis_toxicity,Honey_bee_toxicity,"
    "Colinus_virginanus_toxicity,Anas_platyrhynchos_toxicity,BCF_c,BCF,Biodegradability,"
    "Photoinduced_toxicity,Phototoxicity_Photoirritation,Photoallergy"
)
columns = [c.strip() for c in COLUMNS_STR.split(",")]

# Strictly text-based columns (not numerically indexed)
TEXT_COLUMNS = {"MolPort_ID", "SMILES", "Lipinski_rule", "Pfizer_rule", "GSK_rule"}

def run_import():
    start_time = time.time()
    
    if os.path.exists(DB_PATH):
        os.remove(DB_PATH)
        print("Old database removed for clean recreation.")

    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    # Performance settings for bulk insertion
    cursor.execute("PRAGMA synchronous = OFF;")
    cursor.execute("PRAGMA journal_mode = MEMORY;")
    cursor.execute("PRAGMA cache_size = -2000000;")  # ~2GB RAM cache

    # Create table structure with properly defined data types
    col_defs = []
    for col in columns:
        if col in TEXT_COLUMNS:
            col_defs.append(f'"{col}" TEXT')
        else:
            col_defs.append(f'"{col}" REAL')
            
    columns_definition = ", ".join(col_defs)
    create_table_sql = f'CREATE TABLE molecules ({columns_definition});'
    cursor.execute(create_table_sql)
    
    csv_files = glob.glob(CSV_PATTERN)
    print(f"{len(csv_files)} CSV files detected.")

    placeholders = ", ".join(["?"] * len(columns))
    insert_sql = f'INSERT INTO molecules ({", ".join([f\'"{c}"\' for c in columns])}) VALUES ({placeholders})'

    total_rows = 0
    block_size = 50000
    buffer = []

    # Read and insert data
    for file in csv_files:
        print(f"Reading: {os.path.basename(file)}...")
        
        with open(file, mode='r', encoding='utf-8', errors='ignore') as f:
            reader = csv.reader(f, delimiter=',')
            header = next(reader, None)
            if not header:
                continue
            
            for row in reader:
                if not row:
                    continue
                
                if len(row) < len(columns):
                    row += [""] * (len(columns) - len(row))
                elif len(row) > len(columns):
                    row = row[:len(columns)]
                
                processed_row = []
                for val, col_name in zip(row, columns):
                    val_stripped = val.strip()
                    if col_name in TEXT_COLUMNS:
                        processed_row.append(val_stripped)
                    else:
                        if val_stripped == "" or val_stripped.lower() == "nan":
                            processed_row.append(None)
                        else:
                            try:
                                processed_row.append(float(val_stripped))
                            except ValueError:
                                processed_row.append(None)

                buffer.append(processed_row)
                total_rows += 1

                if len(buffer) >= block_size:
                    cursor.executemany(insert_sql, buffer)
                    conn.commit()
                    buffer = []
                    print(f" -> {total_rows} records processed...")

    if buffer:
        cursor.executemany(insert_sql, buffer)
        conn.commit()

    print(f"\nInsertion completed. Total loaded records: {total_rows}")

    # =========================================================================
    # FULL AND AUTOMATIC INDEXING
    # =========================================================================
    print("\nCreating instant search indexes for ALL fields...")
    
    # 1. Mandatory text indexes
    print(" - Indexing MolPort_ID...")
    cursor.execute('CREATE INDEX idx_molport_id ON molecules (MolPort_ID);')
    print(" - Indexing SMILES...")
    cursor.execute('CREATE INDEX idx_smiles ON molecules (SMILES);')
    
    # 2. Dynamic loop to automatically index EVERY numeric field
    index_counter = 0
    for col in columns:
        if col not in TEXT_COLUMNS:
            print(f" - Bulk indexing [{index_counter + 1}]: {col}...")
            cursor.execute(f'CREATE INDEX idx_{col.lower()} ON molecules ({col});')
            index_counter += 1

    print("\nSaving indexes to disk...")
    conn.commit()
    conn.close()
    
    end_time = time.time()
    print("\n" + "="*50)
    print(f"FULLY INDEXED DATABASE CREATED IN {round(end_time - start_time, 2)} SECONDS!")
    print(f"Accelerated numeric fields: {index_counter}")
    print(f"Path: {DB_PATH}")
    print("="*50)

if __name__ == "__main__":
    run_import()