Molecular Weight (MW) Distribution Dashboard

DrugBank database
MolPort database
Python script 2: builds the frequency distribution plot of the molecular weight (MW) parameter for DrugBank molecules.
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import numpy as np
from scipy.stats import norm

# =============================================================================
# 1. INPUT DATA (STANDARD TEXT FORMAT)
# =============================================================================
# Two whitespace-separated columns per row: the bin center and its
# % frequency. The first row is a column header, not data.
datos_crudos = """Bin Center    % Frequency
0   0.04
20  0.69
40  0.61
60  2.10
80  0.89
100 1.25
120 1.58
140 2.83
160 2.87
180 3.31
200 3.11
220 3.72
240 4.85
260 5.01
280 6.06
300 6.63
320 5.66
340 4.73
360 5.05
380 5.05
400 4.65
420 4.93
440 3.44
460 3.31
480 2.75
500 2.10
520 1.78
540 1.50
560 1.66
580 1.66
600 1.01
620 1.13
640 0.65
660 0.57
680 0.49
700 0.32
720 0.24
740 0.49
760 0.36
780 0.57
800 0.36"""

# Split the text into rows and set the header row aside; `lineas` holds
# the raw data rows only.
_encabezado, *lineas = datos_crudos.strip().split('\n')

# Turn each row into a (bin center, % frequency) pair of floats, then
# transpose the pairs into one list per column.
_pares = [tuple(float(campo) for campo in linea.split()) for linea in lineas]
bins_array, freq_array = map(list, zip(*_pares))

# NumPy arrays built from the two columns. `bins` and `freq` are extra
# names bound to the same array objects as `bin_centers` / `frequencies`.
bin_centers = bins = np.array(bins_array)
frequencies = freq = np.array(freq_array)

# 2. Gaussian fit parameters: curve height at the mean, mean, and standard deviation
amplitude, mean, sd = 5.536, 322.3, 141.2

# Evenly spaced X grid (400 points from 0 to 850) so the curve looks smooth
x_smooth = np.linspace(0, 850, num=400)

# Evaluate amplitude * exp(-z^2 / 2), where z is the distance from the
# mean expressed in standard deviations
_z = (x_smooth - mean) / sd
y_smooth = amplitude * np.exp(-0.5 * np.square(_z))

# 3. Bar colors (MW traffic-light system)
def _mw_color(mw):
    """Return the traffic-light color for a bin centered at molecular weight *mw*."""
    # Optimal range: 200 to 500 (classic Lipinski's Rule)
    if 200 <= mw <= 500:
        return 'green'
    # Caution range: 100-200 (fragments) or 500-600 (upper limit)
    if (100 <= mw < 200) or (500 < mw <= 600):
        return 'gold'
    # Risk range: < 100 (too small) or > 600 (too large / non-oral)
    return 'firebrick'


# One color per bar, in the same order as the bin centers
colors = [_mw_color(center) for center in bin_centers]

# 4. Create the figure (7 x 6 inches)
plt.figure(figsize=(7, 6))

# A. Bars: bin centers are 20 apart, so a width of 16 leaves a small gap
#    between neighboring bars
_bar_style = {
    'width': 16,
    'color': colors,
    'edgecolor': 'black',
    'alpha': 0.7,
    'label': 'Data Frequency',
}
plt.bar(bin_centers, frequencies, **_bar_style)

# B. Trend line: the smooth Gaussian curve drawn over the bars
plt.plot(
    x_smooth,
    y_smooth,
    color='darkorange',
    linewidth=2.5,
    label='Gaussian Fit',
)

# 5. Axis labels and title
plt.xlabel('Molecular Weight (Da)', fontsize=12)
plt.ylabel('% Frequency', fontsize=12)
plt.title('Molecular Weight Distribution for DrugBank', fontsize=14)

# X axis: a tick every 50 units from 0 to 800, rotated labels, and a
# little room to the left of the first bar
_tick_positions = np.arange(0, 850, 50)
plt.xticks(_tick_positions, rotation=45, fontsize=9)
plt.xlim(-20, 850)

# 6. Custom legend: one line entry for the fit followed by one patch per
#    color category, styled like the bars
_patch_style = {'edgecolor': 'black', 'alpha': 0.7}
_categories = (
    ('green', 'Optimal (200 - 500 Da)'),
    ('gold', 'Caution (100-200, 500-600 Da)'),
    ('firebrick', 'Risk / Non-Oral (<100, >600 Da)'),
)
legend_elements = [
    Line2D([0], [0], color='darkorange', lw=2.5, label=f'Fit (Mean={mean}, SD={sd})'),
]
legend_elements.extend(
    Patch(facecolor=_color, label=_text, **_patch_style)
    for _color, _text in _categories
)

plt.legend(handles=legend_elements, loc='upper right')
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()

plt.show()