Set up environment, global variables, load libraries
library(dplyr)
library(ape)
library(phangorn)
library(RColorBrewer)
library(phytools)
# XXX set working directory to script source first
REPO_HOME <- paste(getwd(), "/../", sep = "")
# names of domesticated taxa
dom.taxa <- c(
"Bos_frontalis_gaurus", "Bos_grunniens_mutus", "Bos_javanicus", "Bos_taurus_primigenius",
"Bubalus_bubalis_arnee", "Camelus_bactrianus", "Camelus_dromedarius", "Capra_hircus_aegagrus",
"Equus_przewalskii", "Lama_glama_guanicoe", "Ovis_aries_orientalis",
"Rangifer_tarandus", "Sus_scrofa", "Vicugna_vicugna"
)
Cluster by inverse niche overlap
# load the hclust trees based on Gower's D and Schoener's D
hclust.maxent.traits<-read.csv(paste(REPO_HOME, "/Results/maxent/comparative/gower/maxent_omi.csv", sep=""), header = TRUE, row.names = 1)
hclust.maxent.traits<- as.dist(hclust.maxent.traits)
hclust.maxent.traits<- hclust(hclust.maxent.traits)
hclust.raw.occurences<- read.csv(paste(REPO_HOME, "/Results/maxent/comparative/gower/raw_omi.csv", sep=""), header = TRUE, row.names = 1)
hclust.raw.occurences<- as.dist(hclust.raw.occurences)
hclust.raw.occurences<- hclust(hclust.raw.occurences)
hclust.maxent<- read.csv(paste(REPO_HOME, "/Results/maxent/comparative/schoener/overlap.csv", sep=""), header = TRUE, row.names = 1)
hclust.maxent<- as.dist(1-hclust.maxent)
hclust.maxent<- hclust(hclust.maxent)
# list tree files
tree.list<- list(hclust.maxent.traits, hclust.raw.occurences, hclust.maxent)
list.names<- list("clusters.Maxent.traits.gower" ,"clusters.raw.occurences.gower" ,"clusters.Maxent.Schoener")
Partition the clustering
# instantiate results vector and transform function
for (i in 1:length(list.names)) {
# load hclust tree
hclust.tree<- tree.list[[i]]
specificity <- vector(length = 31, mode = "numeric")
make_df <- function(partitions) {
partitions.df <- as.data.frame(partitions)
taxa.names <- names(partitions)
for ( j in 1:length(taxa.names) ) {
taxon.name <- taxa.names[j]
if ( taxon.name %in% dom.taxa ) {
partitions.df[j,2] <- T
} else {
partitions.df[j,2] <- F
}
}
names(partitions.df) <- c("cluster","domesticated")
return(partitions.df)
}
names.plots<- list.names[[i]]
# do the partition from one for all taxa to one for each taxon
for ( i in 1:31 ) {
partitions <- cutree( hclust.tree, k = i )
partitions.df <- make_df(partitions)
# count distinct clusters for domesticated taxa
nclusters <- as.vector(unique(dplyr::select(filter(partitions.df,domesticated), cluster)))
spec <- length(nclusters$cluster) / i
specificity[i] <- spec
}
# partition the tree to the optimal number of clusters
optimum.nclusters <- which.min(specificity)
partitions <- cutree( hclust.tree, k = optimum.nclusters )
partitions.df <- make_df(partitions)
partitionsdf.name<- sprintf("%s/results/maxent/%s.csv", REPO_HOME, names.plots)
write.csv(partitions.df, partitionsdf.name)
# consecutive plot commands seem to work better when wrapped inside curly braces
{
plot(specificity, type = "b", main= names.plots)
abline(v = optimum.nclusters)
}
}
Select niche traits of greatest magnitude
omi.file.maxent <- sprintf('%s/results/OMI/normalized_MaxEnt_values.csv',REPO_HOME)
omi.file.raw.occurrences<- sprintf('%s/results/OMI/normalized_raw_values.csv',REPO_HOME)
omi.file<- list(omi.file.maxent, omi.file.raw.occurrences)
omi.names<- list("magnitudes.Maxent.probeer.csv", "magnitudes.raw.probeer.csv")
part.df.maxent<- sprintf('%s/results/maxent/clusters.Maxent.traits.gower.csv',REPO_HOME)
part.df.occurrences<- sprintf('%s/results/maxent/clusters.raw.occurences.gower.csv',REPO_HOME)
part.df<- list(part.df.maxent, part.df.occurrences)
for (i in 1:2) {
partitions.df<- read.table(part.df[[i]], header = T, sep = ',', row.names = 1)
omi.data <- read.table(omi.file[[i]], header = T, sep = ',', row.names = 1)
trait.names <- names(omi.data)[2:length(omi.data)]
trait.taxa <- row.names(omi.data)
dom.taxa.clusters <- as.vector(unique(dplyr::select(filter(partitions.df,domesticated),cluster)))$cluster
magnitude.matrix <- matrix(nrow = length(trait.names), ncol = length(dom.taxa.clusters))
partitions.df$taxon.name <- row.names(partitions.df)
csv.file<- omi.file.maxent <- sprintf('%s/results/maxent/%s',REPO_HOME, omi.names[[i]])
for ( i in 1:length(trait.names) ) {
trait <- trait.names[i]
for ( j in 1:length(dom.taxa.clusters) ) {
c <- dom.taxa.clusters[j]
ingroup <- dplyr::select(filter(partitions.df, cluster == c),taxon.name)
outgroup <- dplyr::select(filter(partitions.df, cluster != c),taxon.name)
ingroup.mean <- mean(omi.data[ingroup$taxon.name,trait])
outgroup.mean <- mean(omi.data[outgroup$taxon.name,trait])
magnitude <- abs(ingroup.mean-outgroup.mean)
magnitude.matrix[i,j] <- magnitude
}
}
magnitude.df <- data.frame(magnitude.matrix)
names(magnitude.df) <- dom.taxa.clusters
row.names(magnitude.df) <- trait.names
write.csv(magnitude.df, csv.file)
}
Find cluster MRCAs
for (i in 1:length(list.names)) {
hclust.tree <- tree.list[[i]]
name.plot<- list.names[[i]]
hclust.phylo <- as.phylo(hclust.tree)
mrcas <- list()
mrcas.labels <- c('a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z', 'az', 'bz', 'cz', 'dz', 'ez', 'fz')
for ( i in 1:optimum.nclusters ) {
tips <- dplyr::select(filter(partitions.df,cluster==i),taxon.name)$taxon.name
l <- mrcas.labels[i]
mrcas[[l]] <- mrca.phylo(hclust.phylo, match(tips,hclust.phylo$tip.label))
}
### Plot the optimal partitioning
colors <- c(
'#5c5c7d',
'#0868b4',
'#0878a4',
'#65858e',
'#0a8ba3',
'#398796',
'#9ea09d',
'#828379',
'#545522',
'#8e8e15',
'#746113',
'#ac9331',
'#c9c6bc',
'#91700e',
'#956f31',
'#b69667',
'#362e25',
'#bb6314',
'#9b7551',
'#7c6653',
'#73411f',
'#c2734b',
'#8f5438',
'#bc4313',
'#99310f',
'#681c10',
'#FF7F50',
'#FF7F42',
'#220d0f',
'salmon',
'skyblue'
)
# prepare: coerce dendrogram to phylo; generate palette; instantiate edge color vector
#colors <- rainbow(optimum.nclusters)
edge.colors <- vector( mode = "character", length = length(hclust.phylo$edge.length) )
# calculate "ancestral states", for coloring the clades
anc.acctran <- ancestral.pars( hclust.phylo, as.phyDat(as.factor(partitions)), "ACCTRAN" )
for ( i in 1:length(anc.acctran) ) {
# i is node ID in the ancestral states table
states <- anc.acctran[[i]][1,]
# $edge is a 2-column adjacency list, 2nd column is child, values are node IDs
k <- match(i, hclust.phylo$edge[,2])
if ( max(states) == 1 ) {
j <- which.max(states)
edge.colors[k] <- colors[j]
} else {
edge.colors[k] <- '#000000'
}
}
# plot the tree
{
for ( i in 1:length(hclust.phylo$tip.label) ) {
index <- match(hclust.phylo$tip.label[i], dom.taxa)
if ( is.na(index) ) {
hclust.phylo$tip.label[i] <- ''
} else {
hclust.phylo$tip.label[i] <- index
}
}
plot.coord <- plot(
hclust.phylo,
edge.color = edge.colors,
type = "fan",
show.tip.label = T,
edge.width = 3,
font = 1,
label.offset = 0.01,
x.lim = c(-0.5, 0.5),
y.lim = c(-0.5, 0.5),
align.tip.label = T,
main= name.plot
)
for ( name in names(mrcas) ) {
arc.cladelabels(
hclust.phylo,
name,
node = mrcas[[name]],
ln.offset = 1.12,
lab.offset = 1.19,
font = 1
)
}
}
}