To run this script, you need to download the following comma-separated text files compressed with predictions of all parameters calculated by admetSAR 3.0 for the 2.2 million compounds in the MolPort library:
MPv8-noQ-01_admetSAR3.zip Size: 481 MB
MPv8-noQ-02_admetSAR3.zip Size: 483 MB
MPv8-noQ-03_admetSAR3.zip Size: 275 MB
MPv8-Quiral-01_admetSAR3.zip Size: 472 MB
MPv8-Quiral-02_admetSAR3.zip Size: 449 MB
import os
import glob
import csv
import sqlite3
import time
# Path configuration according to your environment
DIRECTORY = r"E:\admetSAR-dataset-csv-files\" # Change this to your actual path
DB_PATH = os.path.join(DIRECTORY, "base-MolPort.db")
CSV_PATTERN = os.path.join(DIRECTORY, "*.csv")
# Exact list of the 120 provided columns
COLUMNS_STR = (
"MolPort_ID,SMILES,MW,nAtom,nHet,nRing,nRot,HBA,HBD,TPSA,SlogP,logS,QED,"
"Lipinski_rule,Pfizer_rule,GSK_rule,logP,pKa,Acidic_pKa,Basic_pKa,Caco_2_c,Caco_2,"
"HIA,MDCK,F50,F30,F20,BBB,OATP1B1_inhibitor,OATP1B3_inhibitor,OATP2B1_inhibitor,"
"OCT1_inhibitor,OCT2_inhibitor,BCRP_inhibitor,BSEP_inhibitor,MATE1_inhibitor,"
"Pgp_inhibitor,Pgp_substrate,PPB,VDss,CYP1A2_inhibitor,CYP3A4_inhibitor,CYP2B6_inhibitor,"
"CYP2C9_inhibitor,CYP2C19_inhibitor,CYP2D6_inhibitor,CYP1A2_substrate,CYP3A4_substrate,"
"CYP2B6_substrate,CYP2C9_substrate,CYP2C19_substrate,CYP2D6_substrate,HLM,RLM,"
"UGT_substrate,CLp_c,CLr,T50,MRT,Neurotoxicity,DILI,hERG_1uM,hERG_10uM,hERG_30uM,"
"hERG_1_10uM,hERG_10_30uM,Respiratory_toxicity,Nephrotoxicity,Eye_corrosion,Eye_irritation,"
"Skin_corrosion,Skin_irritation,Skin_sensitisation,ADT,Ames,Mouse_carcinogenicity_c,"
"Mouse_carcinogenicity,Rat_carcinogenicity_c,Rat_carcinogenicity,Rodents_carcinogenicity,"
"Micronucleus,Reproductive_toxicity,Mitochondrial_toxicity,Hemolytic_toxicity,"
"Repeated_dose_toxicity,AOT_c,AOT,FDAMDD_c,FDAMDD,AR,ER,AR_LBD,ER_LBD,Aromatase,AhR,"
"ARE,ATAD5,HSE,p53,PPAR,MMP,TR,GR,subcapitata_toxicity,Crustaceans_toxicity,magna_toxicity,"
"Fish_toxicity,Fathead_minnow_toxicity,Bluegill_sunfish_toxicity,Rainbow_trout_toxicity,"
"Sheepshead_minnow_toxicity,pyriformis_toxicity_c,pyriformis_toxicity,Honey_bee_toxicity,"
"Colinus_virginanus_toxicity,Anas_platyrhynchos_toxicity,BCF_c,BCF,Biodegradability,"
"Photoinduced_toxicity,Phototoxicity_Photoirritation,Photoallergy"
)
columns = [c.strip() for c in COLUMNS_STR.split(",")]
# Strictly text-based columns (not numerically indexed)
TEXT_COLUMNS = {"MolPort_ID", "SMILES", "Lipinski_rule", "Pfizer_rule", "GSK_rule"}
def run_import():
start_time = time.time()
if os.path.exists(DB_PATH):
os.remove(DB_PATH)
print("Old database removed for clean recreation.")
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Performance settings for bulk insertion
cursor.execute("PRAGMA synchronous = OFF;")
cursor.execute("PRAGMA journal_mode = MEMORY;")
cursor.execute("PRAGMA cache_size = -2000000;") # ~2GB RAM cache
# Create table structure with properly defined data types
col_defs = []
for col in columns:
if col in TEXT_COLUMNS:
col_defs.append(f'"{col}" TEXT')
else:
col_defs.append(f'"{col}" REAL')
columns_definition = ", ".join(col_defs)
create_table_sql = f'CREATE TABLE molecules ({columns_definition});'
cursor.execute(create_table_sql)
csv_files = glob.glob(CSV_PATTERN)
print(f"{len(csv_files)} CSV files detected.")
placeholders = ", ".join(["?"] * len(columns))
insert_sql = f'INSERT INTO molecules ({", ".join([f\'"{c}"\' for c in columns])}) VALUES ({placeholders})'
total_rows = 0
block_size = 50000
buffer = []
# Read and insert data
for file in csv_files:
print(f"Reading: {os.path.basename(file)}...")
with open(file, mode='r', encoding='utf-8', errors='ignore') as f:
reader = csv.reader(f, delimiter=',')
header = next(reader, None)
if not header:
continue
for row in reader:
if not row:
continue
if len(row) < len(columns):
row += [""] * (len(columns) - len(row))
elif len(row) > len(columns):
row = row[:len(columns)]
processed_row = []
for val, col_name in zip(row, columns):
val_stripped = val.strip()
if col_name in TEXT_COLUMNS:
processed_row.append(val_stripped)
else:
if val_stripped == "" or val_stripped.lower() == "nan":
processed_row.append(None)
else:
try:
processed_row.append(float(val_stripped))
except ValueError:
processed_row.append(None)
buffer.append(processed_row)
total_rows += 1
if len(buffer) >= block_size:
cursor.executemany(insert_sql, buffer)
conn.commit()
buffer = []
print(f" -> {total_rows} records processed...")
if buffer:
cursor.executemany(insert_sql, buffer)
conn.commit()
print(f"\nInsertion completed. Total loaded records: {total_rows}")
# =========================================================================
# FULL AND AUTOMATIC INDEXING
# =========================================================================
print("\nCreating instant search indexes for ALL fields...")
# 1. Mandatory text indexes
print(" - Indexing MolPort_ID...")
cursor.execute('CREATE INDEX idx_molport_id ON molecules (MolPort_ID);')
print(" - Indexing SMILES...")
cursor.execute('CREATE INDEX idx_smiles ON molecules (SMILES);')
# 2. Dynamic loop to automatically index EVERY numeric field
index_counter = 0
for col in columns:
if col not in TEXT_COLUMNS:
print(f" - Bulk indexing [{index_counter + 1}]: {col}...")
cursor.execute(f'CREATE INDEX idx_{col.lower()} ON molecules ({col});')
index_counter += 1
print("\nSaving indexes to disk...")
conn.commit()
conn.close()
end_time = time.time()
print("\n" + "="*50)
print(f"FULLY INDEXED DATABASE CREATED IN {round(end_time - start_time, 2)} SECONDS!")
print(f"Accelerated numeric fields: {index_counter}")
print(f"Path: {DB_PATH}")
print("="*50)
if __name__ == "__main__":
run_import()