# tutorial_archetypes_prototypes_siqd_ensembles.r michael Sat Oct 29 21:38:31 2016

# Electronic Supplementary Material (ESI) for Nanoscale.
# This journal is The Royal Society of Chemistry 2016
#
# tutorial_archetypes_prototypes_siqd_ensembles.r michael Sat Oct 29 21:38:31 2016

# R script to identify and analyse archetypes and prototypes
# in SiQD ensemble with Boltzmann distribution

# Clear any variable from the workspace. Function objects are kept:
# lsf.str() lists them and setdiff() removes them from the set to delete.
rm(list = setdiff(ls(), lsf.str()))

# NOTE(review): warn = -1 silences every warning for the rest of the session.
# Kept from the original script; consider removing it while debugging.
options(warn = -1)

# Load/install all the required libraries
required_packages <- c(
  "archetypes", "gplots", "scatterplot3d", "stringr",
  "RColorBrewer", "calibrate", "rgl"
)
for (mypkg in required_packages) {
  # Install only the packages that are not available yet.
  if (!is.element(mypkg, installed.packages()[, 1])) {
    install.packages(mypkg)
  }
  # library() stops with an error when the package cannot be attached,
  # whereas require() would only return FALSE and let the script continue.
  library(mypkg, character.only = TRUE)
}

# Console output recorded in the original document:
#   Loading required package: archetypes
#   Loading required package: modeltools
#   Loading required package: stats4
#   Loading required package: nnls
#   Loading required package: gplots
#   Attaching package: 'gplots'
#   The following object is masked from 'package:stats': lowess
#   Loading required package: scatterplot3d
#   Loading required package: stringr
#   Loading required package: RColorBrewer
#   Loading required package: calibrate
#   Loading required package: MASS
#   Loading required package: rgl
# -- page 1 --

# Set random seed for reproducibility
set.seed(1234)

#' Find the closest structures to the cluster centroids or archetypes
#'
#' For every row of `archetype`, the Euclidean distance to every row of
#' `examples_coord` is computed and the row indices of the `number` nearest
#' examples are returned, nearest first.
#'
#' @param archetype Numeric matrix with one reference point per row.
#' @param examples_coord Numeric matrix with one example per row and the same
#'   number of columns as `archetype`.
#' @param number Number of nearest examples to report for each reference point.
#' @return Integer matrix with `nrow(archetype)` rows and `number` columns;
#'   entry `[i, k]` is the row index in `examples_coord` of the k-th closest
#'   example to reference point `i`.
getsimilartovector <- function(archetype, examples_coord, number) {
  archetype <- as.matrix(archetype)
  examples_coord <- as.matrix(examples_coord)
  arch_close <- matrix(nrow = nrow(archetype), ncol = number)
  for (i in seq_len(nrow(archetype))) {
    # Subtract reference point i from every example row in one step.
    offsets <- sweep(examples_coord, 2, archetype[i, ])
    distances <- sqrt(rowSums(offsets^2))
    arch_close[i, ] <- order(distances)[seq_len(number)]
  }
  arch_close
}

# Define plots margin parameters
par(mar = c(4, 4, 1, 1))

# Load the dataset from a comma separated text file
folder <- "/home/michael/dropbox/nanoinformatics_dev/papers/silicon_aa/"
data_file <- paste0(folder, "siqds_boltzmann.csv")
datasource <- read.delim(data_file, header = TRUE, sep = ",")

# Define coloring scheme for plots from a column in the dataset,
# ie. nanoparticule shape
color_id <- which(names(datasource) == "shape.index..for.convenience.")

# Define the column features to be used in the analysis
features <- seq(4, 11)

# Define the structures to use in the analysis (all in this case)
data_select_id <- seq_len(nrow(datasource))

# Select the data to run the analysis
data <- datasource[data_select_id, features]

# Scale the data to zero mean and unit standard deviation
data_scaled <- scale(data)

# Perform PCA analysis (the argument is spelled `scale.` in prcomp())
pca <- prcomp(data, scale. = TRUE)

# Compute the explained variance for different number of principal components
pc <- cumsum(pca$sdev^2 / sum(pca$sdev^2))

# Search archetypes ranging from 1 to 15 archetypes
result_archetype <- stepArchetypes(data_scaled, k = 1:15, nrep = 5)

# Console output recorded in the original document:
#   *** k=1, rep=1:
# -- page 2 --

# Console output of the archetype search (continued):
#   *** k=1, rep=2: *** k=1, rep=3: *** k=1, rep=4: *** k=1, rep=5:
#   *** k=2, rep=1: *** k=2, rep=2: *** k=2, rep=3: *** k=2, rep=4: *** k=2, rep=5:
#   *** k=3, rep=1: *** k=3, rep=2: *** k=3, rep=3: *** k=3, rep=4: *** k=3, rep=5:
#   *** k=4, rep=1: *** k=4, rep=2: *** k=4, rep=3: *** k=4, rep=4: *** k=4, rep=5:
#   *** k=5, rep=1: *** k=5, rep=2: *** k=5, rep=3: *** k=5, rep=4: *** k=5, rep=5:
#   *** k=6, rep=1: *** k=6, rep=2: *** k=6, rep=3:
# -- page 3 --

# Console output of the archetype search (continued):
#   *** k=6, rep=4: *** k=6, rep=5:
#   *** k=7, rep=1: *** k=7, rep=2: *** k=7, rep=3: *** k=7, rep=4: *** k=7, rep=5:
#   *** k=8, rep=1: *** k=8, rep=2: *** k=8, rep=3: *** k=8, rep=4: *** k=8, rep=5:
#   *** k=9, rep=1: *** k=9, rep=2: *** k=9, rep=3: *** k=9, rep=4: *** k=9, rep=5:
#   *** k=10, rep=1: *** k=10, rep=2: *** k=10, rep=3: *** k=10, rep=4: *** k=10, rep=5:
#   *** k=11, rep=1: *** k=11, rep=2: *** k=11, rep=3: *** k=11, rep=4: *** k=11, rep=5:
# -- page 4 --

# Console output of the archetype search (continued):
#   *** k=12, rep=1: *** k=12, rep=2: *** k=12, rep=3: *** k=12, rep=4: *** k=12, rep=5:
#   *** k=13, rep=1: *** k=13, rep=2: *** k=13, rep=3: *** k=13, rep=4: *** k=13, rep=5:
#   *** k=14, rep=1: *** k=14, rep=2: *** k=14, rep=3: *** k=14, rep=4: *** k=14, rep=5:
#   *** k=15, rep=1: *** k=15, rep=2: *** k=15, rep=3: *** k=15, rep=4: *** k=15, rep=5:

# Select the optimum number of principal components, cluster and archetypes

# Compute the explained variance for different number of archetypes.
# For each k the best model is used to reconstruct the data (alphas %*%
# archetypes) and the mean per-sample fraction of the squared norm that the
# reconstruction retains is stored.
total_ss <- rowSums(data_scaled^2)
EV <- numeric(15)
for (i in seq_len(15)) {
  aa_temp <- bestModel(result_archetype[[i]])
  XCs <- aa_temp$alphas %*% aa_temp$archetypes
  EV[i] <- mean((total_ss - rowSums((data_scaled - XCs)^2)) / total_ss)
}

# Compute the explained variance for different number of clusters
# (between-cluster sum of squares over total sum of squares).
clusters_ss <- numeric(15)
for (i in seq_len(15)) {
  clusters_tmp <- kmeans(data_scaled, i)
  # -- page 5 --
  clusters_ss[i] <- clusters_tmp$betweenss / clusters_tmp$totss
}

# Plot the amount of explain variance vs. number of components.
# The first call only sets up the axes (points drawn in white); the three
# curves are then overlaid: k-means (pch 23), archetypes (pch 21), PCA (pch 25).
plot(
  clusters_ss,
  ylim = c(-0.040, 1.04), xlim = c(0, 17), cex = 0.8, cex.axis = 1.2,
  fg = "black", col = "white",
  xlab = "number of components", ylab = "explained variance",
  cex.lab = 1.2, pin = c(7, 5)
)
lines(clusters_ss, col = "black")
points(clusters_ss, pch = 23, cex = 1.6, fg = "black", bg = "white")
lines(EV, col = "black")
points(EV, pch = 21, cex = 1.6, fg = "black", bg = "white")
points(pc, pch = 25, cex = 1.6, fg = "black", bg = "white")
lines(pc, col = "black")

# Text extracted from the rendered figure in the original document:
#   Explained variance 0.0 0.2 0.4 0.6 0.8 1.0
#   0 5 10 15 Number of components

# Set the optimum number of clusters and archetypes according to the
# elbow criteria in the plots
number_cluster <- 8
number_archetype <- 8

# Compute the archetype model
aa <- bestModel(result_archetype[[number_archetype]])

# Plot the dataset as simplex plots of the archetypes
pointcolor <- datasource[, color_id] # Coloring the points according a column value

# Compute the sample explained variance
# -- page 6 --

# Reconstruct every sample from the selected archetype model and compute, per
# sample, the fraction of its squared norm that the reconstruction retains.
XCs <- aa$alphas %*% aa$archetypes
sample_ss <- rowSums(data_scaled^2)
ESV <- (sample_ss - rowSums((data_scaled - XCs)^2)) / sample_ss
barplot(ESV, ylim = c(0, 1.1), space = 10)

# Text extracted from the rendered figure in the original document:
#   0.0 0.2 0.4 0.6 0.8 1.0
#   1 15 31 47 63 79 95 113 133 153 173 193 213 233

# Compute the projection of the data in a simpleplot
simplexplot(aa, points_col = pointcolor)

# Text extracted from the rendered figure in the original document:
#   A6 A7 A5 A8 A4 A1 A3 A2
# -- page 7 --

# 2D coordinates of the archetypes (simplex nodes) used by the simplex plot.
sp <- simplex_projection(aa$archetypes)

# Find the closest structure to the simplex nodes (archetypes).
# Each sample is placed in the simplex plane as alphas %*% sp, and the single
# nearest sample to every node is kept.
archetypal_structures <- getsimilartovector(sp, aa$alphas %*% sp, 1)

# Compute the clusters
clusters <- kmeans(data_scaled, number_cluster)

# Find the closest structure to the cluster centroids (prototypes)
clusters_structures <- getsimilartovector(clusters$centers, data_scaled, 1)

# Build a dendrogram to compare the archetypes.
# as.vector() flattens the one-column index matrix before row selection.
hc_structure <- hclust(dist(data_scaled[as.vector(archetypal_structures), ]))
plot(hc_structure)

# Text extracted from the rendered figure in the original document:
#   Cluster Dendrogram 4 158 Height 2 4 6 8 10 7 18 44 224 64 13
#   dist(data_scaled[archetypal_structures, ])

# Build a dendrogram to compare the prototypes
hc_structure_clusters <- hclust(
  dist(data_scaled[as.vector(clusters_structures), ])
)
plot(hc_structure_clusters)
# -- page 8 --

# Text extracted from the rendered figure in the original document:
#   Cluster Dendrogram Height 2 3 4 5 6 77 38 96 85 130 174 164 212
#   dist(data_scaled[clusters_structures, ])

# Build a heatmap of the contribution of the archetypes to each sample

# Define plots margin parameters.
# In case of "Error in plot.new() : figure margins too large" resize the plot
# windows in rstudio to be the full left panel.
par(mar = c(1, 1, 1, 1))

# Rows are the samples ordered by the coloring column; columns are the
# archetypes, ordered by a dendrogram built from the archetypal structures.
heatmap.2(
  aa$alphas[order(datasource[data_select_id, color_id]), ],
  trace = "none",
  Rowv = FALSE,
  density.info = "none",
  dendrogram = "column",
  Colv = as.dendrogram(
    hclust(dist(data_scaled[as.vector(archetypal_structures), ]))
  ),
  # Palette names are case sensitive in RColorBrewer.
  col = brewer.pal(7, "RdBu")
)
# -- page 9 --

# Text extracted from the rendered figure in the original document:
#   Color Key 0 0.4 0.8 Value
#   row labels 242, 241, 240, ..., 3, 2, 1 (every integer, descending)
#   column labels 1 6 2 4 7 8 3 5

# Build 3D scatter plots of the Principal Component Analysis (PCA)
# Archetype structures are bigger spheres and prototypes are shadowed spheres
#
# NOTE(review): the rgl.* drawing interface used below appears to be
# deprecated in recent rgl releases in favour of open3d()/spheres3d()/...;
# it is kept here to preserve the original rendering - confirm against the
# installed rgl version.
library(rgl)

# Values shared by both 3D scenes.
shape_color <- as.numeric(datasource[data_select_id, color_id])
archetype_idx <- as.vector(archetypal_structures)
prototype_idx <- as.vector(clusters_structures)

rgl.open()
rgl.bg(
  sphere = FALSE, fogtype = "none", color = c("white", "black"),
  back = "lines"
)

# All samples as small spheres, colored by the coloring column.
rgl.spheres(
  pca$x[, 1], pca$x[, 2], pca$x[, 3],
  radius = 0.1, color = shape_color
)
# Archetypal structures: large opaque spheres.
rgl.spheres(
  pca$x[archetype_idx, 1], pca$x[archetype_idx, 2], pca$x[archetype_idx, 3],
  radius = 0.5, color = shape_color[archetype_idx]
)
# Prototypes: large semi-transparent spheres.
rgl.spheres(
  pca$x[prototype_idx, 1], pca$x[prototype_idx, 2], pca$x[prototype_idx, 3],
  radius = 0.5, color = shape_color[prototype_idx], alpha = 0.5
)

# Extremes of the first three principal components, used for axes and labels.
x_min <- min(pca$x[, 1])
x_max <- max(pca$x[, 1])
y_min <- min(pca$x[, 2])
y_max <- max(pca$x[, 2])
z_min <- min(pca$x[, 3])
z_max <- max(pca$x[, 3])

# Three axis segments meeting at the corner (x_min - 1, y_max + 1, z_min - 1).
rgl.lines(
  c(x_min - 1, x_max + 1), c(y_max + 1, y_max + 1), c(z_min - 1, z_min - 1),
  color = c("black")
)
rgl.lines(
  c(x_min - 1, x_min - 1), c(y_min - 1, y_max + 1), c(z_min - 1, z_min - 1),
  color = c("black")
)
rgl.lines(
  c(x_min - 1, x_min - 1), c(y_max + 1, y_max + 1), c(z_min - 1, z_max + 1),
  color = c("black")
)
rgl.viewpoint(0, -90, zoom = 0.7)
rgl.texts(
  x_max + 1, y_max + 1, z_min - 2, c("pc1"),
  cex = 1.5, color = c("black"), family = c("sans")
)
rgl.texts(
  x_min - 1, y_min + 1, z_min - 2, c("pc2"),
  cex = 1.5, color = c("black"), family = c("sans")
)
# -- page 10 --
rgl.texts(
  x_min - 1, y_max + 1, z_max + 2, c("pc3"),
  cex = 1.5, color = c("black"), family = c("sans")
)

# Build simplex plots of the archetypes model
# Archetype structures are bigger spheres and prototypes are shadowed spheres

# Rotate the simplex projection by a quarter turn (-90 degrees).
angle <- -(90 / 180) * pi
M <- matrix(c(cos(angle), -sin(angle), sin(angle), cos(angle)), 2, 2)
sp_rot <- sp %*% M
sample_rot <- aa$alphas %*% sp %*% M
n_arch <- length(archetype_idx)

rgl.open()
rgl.bg(
  sphere = FALSE, fogtype = "none", color = c("white", "black"),
  back = "lines"
)

# Outline of the simplex: join consecutive nodes, then close the polygon.
# seq_len() is used because `1:n - 1` parses as `(1:n) - 1` and would start
# the loop at index 0.
for (i in seq_len(max(n_arch - 1, 0))) {
  rgl.lines(
    c(sp_rot[i, 1], sp_rot[i + 1, 1]),
    c(sp_rot[i, 2], sp_rot[i + 1, 2]),
    c(0, 0),
    color = c("black")
  )
}
rgl.lines(
  c(sp_rot[n_arch, 1], sp_rot[1, 1]),
  c(sp_rot[n_arch, 2], sp_rot[1, 2]),
  c(0, 0),
  color = c("black")
)

# All samples in the plane z = 0.
rgl.spheres(
  sample_rot[, 1], sample_rot[, 2], rep(0, nrow(data)),
  radius = 0.3, color = shape_color
)
# Archetypal structures; z has one entry per sphere so that coordinate
# recycling does not draw duplicates.
rgl.spheres(
  sample_rot[archetype_idx, 1], sample_rot[archetype_idx, 2],
  rep(0, length(archetype_idx)),
  radius = 0.7, color = shape_color[archetype_idx]
)
# Prototypes, colored by their own rows of the coloring column.
rgl.spheres(
  sample_rot[prototype_idx, 1], sample_rot[prototype_idx, 2],
  rep(0, length(prototype_idx)),
  radius = 0.7, alpha = 0.3, color = shape_color[prototype_idx]
)

# Legend: one sphere and one text label per distinct value of the coloring
# column, stacked vertically with a spacing of 0.6.
shape_levels <- unique(datasource[data_select_id, color_id])
n_levels <- length(shape_levels)
legend_step <- 0.6 * (seq_len(n_levels) - 1)
rgl.spheres(
  rep(19, n_levels), 1 + legend_step, rep(0, n_levels),
  radius = 0.3, color = c(as.numeric(shape_levels))
)
rgl.texts(
  rep(22.5, n_levels), 1.05 + legend_step, rep(0, n_levels),
  shape_levels,
  cex = 1.5, color = c("black"), family = c("sans")
)
rgl.viewpoint(0, 0, zoom = 0.7)
# -- page 11 --