import numpy as np
import numpy.ma as ma
%matplotlib nbagg
import matplotlib as mpl
import matplotlib.pyplot as plt
ID,RELEASE_DATE,REVISION_DATE,RESOLUTION,RESNAME,NUMBER_ATOMS_NH,N_ATOMS_EDS, RSR,RSCC,LIG_RSRZ,MOGUL_BONDS_RMSZ,MOGUL_RMSZ_NUMBONDS,MOGUL_ANGLES_RMSZ,MOGUL_RMSZ_NUMANGLES
cvs_file = 'ligand_stats_dump.csv'
# select just the data we want to read
data = np.genfromtxt(cvs_file, dtype=None, delimiter=',', names=True,
usecols=('RELEASE_DATE',
'NUMBER_ATOMS_NH',
'MOGUL_BONDS_RMSZ',
'MOGUL_RMSZ_NUMBONDS',
'MOGUL_ANGLES_RMSZ',
'MOGUL_RMSZ_NUMANGLES'
))
plot_number = 0
def next_plot_number():
# method to increment internal plot number for the different graphs
global plot_number
plot_number += 1
return plot_number
def mogul_vs_year_box_plot(size=None, bond=True):
"""
plots a box plot Mogul rmsZ bond or angle vs year
Args:
size (str): one of 's', 'm' or 'l'
bond (bool): True means bond, False means angle
"""
if bond:
column_mogul_rmsz_num = 'MOGUL_RMSZ_NUMBONDS'
column_data = 'MOGUL_BONDS_RMSZ'
ylabel = 'Mogul bond length RMSZ'
png_out = 'figure_1_bond'
else:
column_mogul_rmsz_num = 'MOGUL_RMSZ_NUMANGLES'
column_data = 'MOGUL_ANGLES_RMSZ'
ylabel = 'Mogul angle RMSZ'
png_out = 'figure_s1_angle'
if size == 's':
title='Ligands with 6 to 10 non-hydrogen atoms'
size_select = np.logical_and( data['NUMBER_ATOMS_NH'] > 5, data['NUMBER_ATOMS_NH'] < 11)
png_out += '_a_small.png'
elif size == 'm':
title='Ligands with 11 to 20 non-hydrogen atoms'
size_select = np.logical_and( data['NUMBER_ATOMS_NH'] > 10, data['NUMBER_ATOMS_NH'] < 21)
png_out += '_b_medium.png'
elif size == 'l':
title='Ligands with more than 20 non-hydrogen atoms'
size_select = data['NUMBER_ATOMS_NH'] > 20
png_out += '_c_large.png'
else:
raise RuntimeError('unrecognized size')
enough_data = data[column_mogul_rmsz_num] > 0
selection = np.logical_and(size_select, enough_data)
selected_data = data[selection]
what_to_plot = column_data
data_by_year = []
xaxis_labels = []
for year in range(1994, 2018):
releases = selected_data[selected_data['RELEASE_DATE'] == year]
this_years_data = releases[what_to_plot]
data_by_year.append(this_years_data)
if year%5 == 0:
xaxis_labels.append(str(year))
else:
xaxis_labels.append('')
# bigger axis labels
font_size = 13
mpl.rcParams['xtick.labelsize'] = font_size
mpl.rcParams['ytick.labelsize'] = font_size
mpl.rcParams['axes.labelsize'] = font_size
fig = plt.figure(next_plot_number())
plt.title(title)
# Create an axes instance
ax = fig.add_subplot(111)
# marker for mean https://matplotlib.org/examples/statistics/boxplot_demo.html
meanpointprops = dict(marker='o', markeredgecolor='black', markerfacecolor='red')
medianprops = dict(linestyle='-.', linewidth=2.0, color='black')
nbox = len(data_by_year)
# Create the boxplot
bp = ax.boxplot(data_by_year, meanprops=meanpointprops, medianprops=medianprops,
showfliers=False, whis=[10, 90])
ax.set_xticklabels(xaxis_labels)
axes = plt.gca()
ax.yaxis.set_ticks(np.arange(0., 1.8, 0.2))
ax.set_ylabel(ylabel)
ax.set_ylim([-0.1,3.5])
ax.set_yticks([0.,1.0,2.0,3.0])
# dotted line at 1.0
ax.plot([0,nbox+1],[1,1], ':', color='gray')
ax.plot([0,nbox+1],[0,0], ':', color='gray')
ax.set_xlabel('year of release')
fig.savefig(png_out, dpi=800, bbox_inches = 'tight', pad_inches = 0.05)
# first bond graphs for figure 1.
bond=True
for s in ('s', 'm', 'l'):
mogul_vs_year_box_plot( size=s, bond=bond)
# Angle graphs for figure S1.
bond = False
for s in ('s', 'm', 'l'):
mogul_vs_year_box_plot( size=s, bond=bond)
# select just the data we want to read
data = np.genfromtxt(cvs_file, dtype=None, delimiter=',', names=True,
usecols=('RSCC',
'LIG_RSRZ',
'RESOLUTION',
'RSR'
))
print('Have read data for {} individual PDB ligands'.format(data.shape[0]))
Have data for ~788 000 ligands.
The column LIG_RSRZ
is actually LLDF
so rename it:
data.dtype.names = ('RSCC', 'LLDF', 'RESOLUTION', 'RSR')
data.dtype.names
data[0:20]
# take out nan data
valid_rscc = np.isfinite(data['RSCC'])
valid_rsr = np.isfinite(data['RSR'])
valid_lldf = np.isfinite(data['LLDF'])
valid = np.logical_and(valid_rscc, valid_lldf,valid_rsr)
data_valid = data[valid]
data_valid[0:20]
num_data_valid = data_valid.shape[0]
print('after taking out all nan have data for {} ligands '.format(num_data_valid))
num_tot = data.shape[0]
num_valid_rscc = data[valid_rscc].shape[0]
num_valid_rsr = data[valid_rsr].shape[0]
num_valid_lldf = data[valid_lldf].shape[0]
print('number of ligands with a RSCC value is {} that is {:.1f}% of total'.
format(num_valid_rscc, 100.*num_valid_rscc/num_tot))
print('number of ligands with a RSR value is {} that is {:.1f}% of total'.
format(num_valid_rsr, 100.*num_valid_rsr/num_tot))
print('number of ligands with a LLDF value is {} that is {:.1f}% of total'.
format(num_valid_lldf, 100.*num_valid_lldf/num_tot))
print('N.B. analysis is for ligands with LLDF, RSR and RSCC values that are not nan')
lldf_above2 = data_valid[data_valid['LLDF']>=2.0]
num_lldf_above2 = lldf_above2.shape[0]
lldf_below2 = data_valid[data_valid['LLDF']<2.0]
num_lldf_below2 = lldf_below2.shape[0]
rscc_above_0p95 = data_valid[data_valid['RSCC']>0.95]
num_rscc_above_0p95 = rscc_above_0p95.shape[0]
rscc_below_0p8 = data_valid[data_valid['RSCC']<=0.8]
num_rscc_below_0p8 = rscc_below_0p8.shape[0]
rscc_in_between = data_valid[(data_valid['RSCC']<=0.95) & (data_valid['RSCC']>0.8)]
num_rscc_in_between = rscc_in_between.shape[0]
print('number with LLDF>=2 is {} that is {:.1f}% of ligands with values'.
format(num_lldf_above2, 100.*num_lldf_above2/num_data_valid))
print('number with LLDF<2 is {} that is {:.1f}% of ligands with values'.
format(num_lldf_below2, 100.*num_lldf_below2/num_data_valid))
print()
print('number with RSCC<=0.8 is {} that is {:.1f}% of ligands with values'.
format(num_rscc_below_0p8, 100.*num_rscc_below_0p8/num_data_valid))
print('number with RSCC in range 0.8 to 0.95 is {} that is {:.1f}% of ligands with values'.
format(num_rscc_in_between, 100.*num_rscc_in_between/num_data_valid))
print('number with RSCC>0.95 is {} that is {:.1f}% of ligands with values'.
format(num_rscc_above_0p95, 100.*num_rscc_above_0p95/num_data_valid))
now look at LLDF values in the different ranges:
from collections import OrderedDict
rssc_ranges = OrderedDict()
rssc_ranges[' <=0.8'] = rscc_below_0p8
rssc_ranges['in range 0.8 to 0.95'] = rscc_in_between
rssc_ranges[' >0.95'] = rscc_above_0p95
for key, this_data in rssc_ranges.items():
print('\nRSCC {}:'.format(key))
this_lldf_above2 = this_data[this_data['LLDF']>=2.0]
num_this_lldf_above2 = this_lldf_above2.shape[0]
this_lldf_below2 = this_data[this_data['LLDF']<2.0]
num_this_lldf_below2 = this_lldf_below2.shape[0]
print('\t number with LLDF>=2 is {} that is {:.1f}% of ligands with values'.
format(num_this_lldf_above2, 100.*num_this_lldf_above2/num_data_valid))
print('\t number with LLDF<2 is {} that is {:.1f}% of ligands with values'.
format(num_this_lldf_below2, 100.*num_this_lldf_below2/num_data_valid))
def bin_data( this_data, x_col, y_col, bin_increment, bin_low, echo=False, skip=0):
"""
bin the data into bins and work out the xaxis labels for the graph
"""
data_in_bin = []
xaxis_labels = []
for ic in range(bin_low.size):
low = bin_low[ic]
high = low + bin_increment
select_low = np.greater(this_data[x_col], low)
select_high = np.less_equal(this_data[x_col], high)
in_bin = np.logical_and(select_low,select_high)
this_bin_data = this_data[in_bin]
this_bin_y = this_bin_data[y_col]
if ic%5 == skip:
this_label = '{:.1f}'.format(0.5*(low+high))
else:
this_label = ''
if echo:
print('{:.2f} to {:.2f} label="{}" size={} lower_quartile={:.2f} media={:.2f} upper_quartile={:.2f}'.
format( low, high, this_label, this_bin_y.size,
np.percentile(this_bin_y,25), np.median(this_bin_y), np.percentile(this_bin_y,75)))
data_in_bin.append(this_bin_y)
xaxis_labels.append(this_label)
return xaxis_labels, data_in_bin
def my_plot( fig, x_col, y_col, data_in_bin):
"""common bit of all boxplotes here - supplied with a plt.figure returns the ax for additional bits"""
# all plot bigger axis labels
font_size = 13
mpl.rcParams['xtick.labelsize'] = font_size
mpl.rcParams['ytick.labelsize'] = font_size
mpl.rcParams['axes.labelsize'] = font_size
# Create an axes instance
ax = fig.add_subplot(111)
# marker for mean https://matplotlib.org/examples/statistics/boxplot_demo.html
# meanpointprops = dict(marker='o', markeredgecolor='black', markerfacecolor='red')
medianprops = dict(linestyle='-.', linewidth=2.0, color='black')
nbox = len(data_in_bin)
# Create the boxplot
bp = ax.boxplot(data_in_bin, medianprops=medianprops, showfliers=False, whis=[10, 90])
ax.set_xticklabels(xaxis_labels)
axes = plt.gca()
# bigger axis labels
font_size = 13
mpl.rcParams['xtick.labelsize'] = font_size
mpl.rcParams['ytick.labelsize'] = font_size
mpl.rcParams['axes.labelsize'] = font_size
ax.set_xlabel(x_col)
ax.set_ylabel(y_col)
return ax
x_col = 'RSCC'
y_col = 'LLDF'
# want to bin data in RSCC range from 0.5 to 1.0
bin_increment = 0.02
bin_low = np.arange(0.49, 1.0, bin_increment)
xaxis_labels, data_in_bin = bin_data( data_valid, x_col, y_col, bin_increment, bin_low)
fig = plt.figure(next_plot_number())
ax = my_plot(fig, x_col, y_col, data_in_bin)
ax.set_ylim([-5,13]) # LLDF
ax.plot([0,bin_low.size+1],[2,2], ':', color='gray') # LLDF 2.0 dashed line
ax.plot([16,16],[-10,20], ':', color='gray') # RSCC 0.8 line
fig.savefig('figure_3_lldf_vs_rscc.png', dpi=800, bbox_inches = 'tight', pad_inches = 0.05)
x_col = 'RESOLUTION'
y_col = 'LLDF'
# Resolution bins from 1 to 4
bin_increment = 0.2
bin_low = np.arange(0.7, 4.0, bin_increment)
xaxis_labels, data_in_bin = bin_data( data_valid, x_col, y_col, bin_increment, bin_low, echo=True, skip=1)
fig = plt.figure(next_plot_number())
ax = my_plot(fig, x_col, y_col, data_in_bin)
ax.plot([0,bin_low.size+1],[2.0,2.0], ':', color='gray') # LLDF 2.0 dotted line
ax.set_xlabel('Resolution in Ångström')
fig.savefig('figure_5a_lldf_vs_resolution.png', dpi=800, bbox_inches = 'tight', pad_inches = 0.05)
x_col = 'RESOLUTION'
y_col = 'RSCC'
# Resolution bins from 1 to 4
bin_increment = 0.2
bin_low = np.arange(0.7, 4.0, bin_increment)
xaxis_labels, data_in_bin = bin_data( data_valid, x_col, y_col, bin_increment, bin_low, echo=True, skip=1)
fig = plt.figure(next_plot_number())
ax = my_plot(fig, x_col, y_col, data_in_bin)
ax.plot([0,bin_low.size+1],[0.8,0.8], ':', color='gray') # RSSC 0.8 dotted line
ax.set_xlabel('Resolution in Ångström')
fig.savefig('figure_5b_rscc_vs_resolution.png', dpi=800, bbox_inches = 'tight', pad_inches = 0.05)
x_col = 'RESOLUTION'
y_col = 'RSR'
bin_increment = 0.2
bin_low = np.arange(0.7, 4.0, bin_increment)
xaxis_labels, data_in_bin = bin_data( data_valid, x_col, y_col, bin_increment, bin_low, echo=False, skip=1)
fig = plt.figure(next_plot_number())
ax = my_plot(fig, x_col, y_col, data_in_bin)
ax.set_xlabel('Resolution in Ångström')
fig.savefig('figure_5c_rsr_vs_resolution.png', dpi=800, bbox_inches = 'tight', pad_inches = 0.05)
x_col = 'RSCC'
y_col = 'RSR'
# want to bin data in RSCC range from 0.5 to 1.0
bin_increment = 0.02
bin_low = np.arange(0.49, 1.0, bin_increment)
xaxis_labels, data_in_bin = bin_data( data_valid, x_col, y_col, bin_increment, bin_low)
# Create a figure instance
fig = plt.figure(next_plot_number())
ax = my_plot(fig, x_col, y_col, data_in_bin)
ax.set_ylim([0,0.6]) # RSR
ax.plot([16,16],[-10,20], ':', color='gray') # RSCC 0.8 line
fig.savefig('figure_s2_rsr_vs_rscc.png', dpi=800, bbox_inches = 'tight', pad_inches = 0.05)
from scipy.stats import spearmanr # use scip Spearman implementation
print("Spearman's rank correlation coefficients:")
for (pair_a, pair_b) in (('RSCC', 'LLDF'), ('RSCC', 'RSR'), ('LLDF', 'RSR')):
spearman_r = spearmanr(data_valid[pair_a],data_valid[pair_b])
print('{} {}\tcorrelation={:5.2f}\tp value={}'.format(pair_a, pair_b, spearman_r[0], spearman_r[1]))