3. outlying mean index

Load the required libraries:

library(adehabitatHS, quietly = T)
library(raster, quietly = T)
library(SDMTools, quietly = T)
library(factoextra, quietly = T)
library(ecospat, quietly = T)
library(cluster, quietly = T)
library(ape, quietly=T)

Set the location of the root directory of the git repository

REPO_HOME <- paste(getwd(),'/../',sep='')

Import the taxa list

taxa.names <- scan(
  paste(
    REPO_HOME, 
    "/data/filtered/taxa.txt", 
    sep = ""
  ), 
  sep = "\n", 
  what = character()
)

Open the abiotic environmental raster layers in the GIS repository:

# Load bioclim data
gis.layers <- raster::getData(
  "worldclim",
  var = "bio",
  res = 5,
  path = paste(REPO_HOME, "/data/GIS", sep = ""),
  download = T
)

# The location of the TIFF file with the stacked layers described directly above
files.names <- list.files(paste(REPO_HOME, "/data/GIS/5_deg", sep = ""))

# Turn the file names into layer names: strip the prefix (which might include
# the resolution) and strip the file extension
gis.layers.names <- files.names
gis.layers.names <- gsub('current_5arcmin_','',gis.layers.names)
gis.layers.names <- gsub('.tif','',gis.layers.names)

# Combine the layer names with those we've already read from BIOCLIM
gis.layers.names <- c(names(gis.layers),gis.layers.names)

# Iterate over files
for (i in 1:length(files.names)) {
  
  # Stack with previously read layers
  gis.layers <- stack(
    gis.layers,
    
    # Read as raster
    raster(
      
      # Construct file name
      paste(REPO_HOME, "/data/GIS/5_deg/", files.names[i], sep = "")
    )
  )
}

# Apply all names
names(gis.layers) <- gis.layers.names
rm(gis.layers.names, files.names, i)

# Define CRS string
crs.string <- "+proj=longlat +datum=WGS84"

The raster layers are transformed to a spatialpixelsdataframe

gis.layers.spdf <- as(gis.layers, "SpatialPixelsDataFrame")
sp::proj4string(gis.layers.spdf) <- CRS(crs.string)

Transform the csv files with longitude and latitude values to a SpatialPointsDataFrame:

# two spatialPointsDataFrames are made 
# (i) a SpatialPointsDataFrame based on the raw occurence data
# (ii) a SpatialPointsDataFrame based on the MaxEnt projection layers

# (i) raw occurences 

## warning: the SpatialPointDataFrames are large files and therefore it might not be possible to run both datasets in the same run or even run all 152 species. The datasets can be split by for example changing 'i in length(taxa.names)' to 'i in 1:100' to calculate the normalized mean values for the first 100 species. Splitting the datasets in smaller subsets does not affect the outcomes because the normalization and averaging is done per species.

# create an empty SpatialPointDataFrame to populate in the following loop
spdf_raw_occurences <- new(
  "SpatialPointsDataFrame", 
  coords = structure(
    numeric(0), 
    .Dim = c(0L, 2L),
    .Dimnames = list( NULL, c("x", "y") )
  ),  
  bbox = structure(
    c(1,1,1,1), 
    .Dim = c(2L, 2L),                         
    .Dimnames = list( c("x","y"), c("min","max") )
  ),
  proj4string = new( "CRS", projargs = crs.string )
) 

# populate the empty dataframe with lat/lon values from the taxa list
# set for example i in 1:100 in the case of memory limits 
# 1:length(taxa.names)
for ( i in 1:10) {
  message(taxa.names[i])
  
  csv.file <- sprintf('%s/data/filtered/%s.csv', REPO_HOME, taxa.names[i])
  species_occurence<-as.matrix(read.csv(csv.file))
  name<- paste(taxa.names[i])
  length_df<- NROW(species_occurence)
  bindlonglat<- as.data.frame(cbind(species_occurence[, c("decimal_longitude", "decimal_latitude")]))
  points<- bindlonglat
  points$decimal_longitude<- as.numeric(as.character(points$decimal_longitude))
  points$decimal_latitude<- as.numeric(as.character(points$decimal_latitude))
  coordinates(points)<- ~ decimal_longitude + decimal_latitude
  spdf2<- SpatialPointsDataFrame(points, data.frame(species = rep(name, length_df)), proj4string = CRS)
  proj4string(spdf2) <- CRS("+proj=longlat +datum=WGS84")
  
  spdf_raw_occurences<- rbind(spdf_raw_occurences, spdf2)

}
## Aepyceros_melampus
## Alcelaphus_buselaphus
## Alcelaphus_caama
## Alces_alces
## Alces_americanus
## Ammotragus_lervia
## Antidorcas_marsupialis
## Antilocapra_americana
## Antilope_cervicapra
## Axis_axis
# (ii) MaxEnt projection layers 

# create an empty SpatialPointDataFrame to populate in the following loop
spdf_MaxEnt <- new(
  "SpatialPointsDataFrame", 
  coords = structure(
    numeric(0), 
    .Dim = c(0L, 2L),
    .Dimnames = list( NULL, c("x", "y") )
  ),  
  bbox = structure(
    c(1,1,1,1), 
    .Dim = c(2L, 2L),                         
    .Dimnames = list( c("x","y"), c("min","max") )
  ),
  proj4string = new( "CRS", projargs = crs.string )
) 

# populate the empty dataframe with lat/lon values from the taxa list
# 1:length(taxa.names)
for ( i in 1:10 ) {
  message(taxa.names[i])
  
  # construct file name
  valid.model.prediction <- sprintf(
    "%s/results/per_species/%s/valid_maxent_prediction_restricted.rda", 
    REPO_HOME, 
    taxa.names[i]
  )
  
  # load model prediction
  env <- new.env()
  nm <- load(valid.model.prediction, envir = env)[[1]]
  raster <- env[[nm]]
  
  # threshold the prediction and retain the remaining points in SPDF
  raster[raster == 0] <- NA
  points <- rasterToPoints( raster, spatial=F )
  spdf2 <- SpatialPointsDataFrame(
    points[,-3], 
    data.frame( species = rep( taxa.names[i], nrow(points) ) )
  )
  proj4string(spdf2) <- CRS(crs.string)
  spdf_MaxEnt <- rbind(spdf_MaxEnt, spdf2)
}
## Aepyceros_melampus
## Alcelaphus_buselaphus
## Alcelaphus_caama
## Alces_alces
## Alces_americanus
## Ammotragus_lervia
## Antidorcas_marsupialis
## Antilocapra_americana
## Antilope_cervicapra
## Axis_axis
rm(i,nm,valid.model.prediction,env,raster,spdf2,points, species_occurence, csv.file, gis.layers, bindlonglat)

Two data frames are constructed:

  1. a data frame (dfavail) with the values in the SpatialPixelsDataframe, i.e. the traits
  2. a data frame (dfused) with the amount of occurrences per pixel

The constructed dataframes are used in a principal component analysis (PCA). A PCA analysis is conducted to standardize all the environmental variables. This standardized dataframe is used a an input dataset for the niche function to calculate the normalized averaged niche traits per species.

# a loop is used to create the used and available dataframes for (i) raw occurences, (ii) MaxEnt projections 
# (i) spdf_raw_occurences
# (ii) spdf_MaxEnt

files<- list(spdf_raw_occurences, spdf_MaxEnt)
names<- list("normalized_raw_values", "normalized_MaxEnt_values")

for (i in 1:length(names)) {
niche.file <- sprintf("%s/Results/OMI/%s.csv", REPO_HOME, names[i])

if (!file.exists(niche.file)) {
cp <- count.points(files[[i]], gis.layers.spdf)

dfavail <- slot(gis.layers.spdf, "data")
dfavail <- tibble::rowid_to_column(dfavail, "ID")
dfavail <- na.omit(dfavail)

dfused <- slot(cp, "data")
dfused <- tibble::rowid_to_column(dfused, "ID")

dfused <- subset(dfused, dfused$ID %in% dfavail$ID )
dfused <- subset(dfused, select = -ID )

# PCA analysis to collect and standardize the environmental variables per occurrence point
dud <- dudi.pca(dfavail[2:42], scannf=F)
rm(dfavail, cp)

# niche analysis to average the standardized values fromt the PCA per species
nic<- niche(dud, dfused, scannf=F)

# shows the averages per species for the standardized environmental variables
p <- nic$tab

# add domesticated TRUE or FALSE
# vector of all domesticated taxa
  dom.taxa <- c(
    "Bos_frontalis_gaurus", "Bos_grunniens_mutus", "Bos_javanicus", "Bos_taurus_primigenius",
    "Bubalus_bubalis_arnee", "Camelus_bactrianus", "Camelus_dromedarius", "Capra_hircus_aegagrus",
    "Equus_przewalskii", "Lama_glama_guanicoe", "Ovis_aries_orientalis",
    "Rangifer_tarandus", "Sus_scrofa", "Vicugna_vicugna"
  )

# create empty column
p["IsDomesticated"]<- NA 

# iterate over taxa
  for ( i in 1:length(taxa.names) ) {
    taxon.name <- taxa.names[i]
    message(taxon.name)
    
    # store whether this taxon is domesticated
    if ( taxon.name %in% dom.taxa ) {
      p[i,42] <- T
    } else {
      p[i,42] <- F
    }
  }
# write to file
write.csv(p, niche.file)

}

}

# clean up
rm(p,nic)
## Warning in rm(p, nic): object 'p' not found
## Warning in rm(p, nic): object 'nic' not found

Using the OMI values we can now calculate the distance between all pairs of species. For this we use Gower’s distance. Given the distance matrix we then construct dendrograms using hierarchical clustering (hclust) and neighbor-joining (nj).

# load the variable importance constructed in the previous script 2_variable_importance.rmd
mean.traits.contributions.csv <- paste(REPO_HOME, "results/maxent/model_summaries/mean_traits_contribution_maxent.csv", sep="")
mean.traits<- read.csv(mean.traits.contributions.csv)
mean.importance<- as.numeric(mean.traits[,2])
names.omi.files<- list("raw_omi_probeer", "maxent_omi_probeer")
for (i in 1:length(names)) {

# name for the gowers distance dataframe
Omi.file<- paste(names.omi.files[i], '.csv', sep='')
overlap.csv <- sprintf("%s/results/maxent/comparative/gower/%s.csv", REPO_HOME, Omi.file)
if (!file.exists(overlap.csv)) {
  
# load the normalized values from the github repository
niche.file <- sprintf("%s/results/OMI/%s.csv", REPO_HOME, names[i])
normalized.values <- read.csv(niche.file, header=TRUE, row.names = 1)

# save gowers dataframe 
gow <- daisy(normalized.values[1:41], metric="gower", stand=FALSE, weights = mean.importance )
gow_df <- as.matrix(gow)

# save gowers dataframe as a csv file
write.csv(gow_df, overlap.csv)

# name for the hclust tree
Omi.hclust<- paste(names.omi.files[i], '_hclust.tree', sep='')
Omi.hclust <- sprintf("%s/results/maxent/comparative/gower/%s.csv", REPO_HOME, Omi.hclust)

# create tree
# hclust tree is used to plot the dendrogram
distances <- as.dist(gow_df)
hclust.tree <- hclust(distances)
tree <- as.phylo(hclust.tree)

# write tree 
write.tree(tree, file = Omi.hclust)

# name for the nj tree
Omi.njtree<- paste(names.omi.files[i], '_omi_nj.tree', sep='')
Omi.njtree <- sprintf("%s/results/maxent/comparative/gower/%s.csv", REPO_HOME, Omi.njtree)

# nj tree is used to calculate the distances
nj<- nj(distances)
nj.tree<- as.phylo(nj)

write.tree(nj.tree, file = Omi.njtree)
}
}

Now we can assess whether domesticated Ungulates come from specific areas in niche space:

# here we calculate the average pairwise patristic distance between domesticates and domesticates and domesticates and wild Ungulates. 

# load nj trees. 
nj.tree.raw.occurrences<- read.tree(paste(REPO_HOME, "/Results/maxent/comparative/gower/raw_omi_nj.tree", sep=""))
nj.tree.maxent.trait.projections<- read.tree(paste(REPO_HOME, "/Results/maxent/comparative/gower/maxent_omi_nj.tree", sep=""))
nj.tree.maxent.projections<- read.tree(paste(REPO_HOME, "/Results/maxent/comparative/schoener/inverse_overlap_nj.tree", sep=""))
  
names.list<- list("clustering_nj_raw_occurences_probeer.pdf","clustering_nj_MaxEnt_occurences_probeer.pdf","clustering_nj_schoener_probeer.pdf")
list.trees<- list(nj.tree.raw.occurrences, nj.tree.maxent.trait.projections, nj.tree.maxent.projections)

for (i in 1:length(names.list)) {
  
clustering.pdf.file <- sprintf("%s/results/maxent/%s", REPO_HOME, names.list[i] )

if (!file.exists(clustering.pdf.file)) {
  
nj.tree<- list.trees[[i]]

# vector of all tips
dom.taxa <- c(
  "Bos_frontalis_gaurus", "Bos_grunniens_mutus", "Bos_javanicus", "Bos_taurus_primigenius",
  "Bubalus_bubalis_arnee", "Camelus_bactrianus", "Camelus_dromedarius", "Capra_hircus_aegagrus",
  "Equus_przewalskii", "Lama_glama_guanicoe", "Ovis_aries_orientalis",
  "Rangifer_tarandus", "Sus_scrofa", "Vicugna_vicugna"
)

# tips in tree
dom.tips <- nj.tree$tip.label[ which(nj.tree$tip.label %in% dom.taxa) ]

# this function calculates the average pairwise patristic distance among an
# input vector of tip labels
mean_pw_distance <- function(nj.tree, tip.vector) {

  # calculate size of distance matrix, instantiate vector
  nt <- length(tip.vector)
  nd <- ((nt * nt) - nt) / 2
  dist.vector <- vector(mode = "numeric", length = nd)

  # do all pairwise comparisons
  k <- 1
  max <- nt - 1
  for (i in 1:max) {
    min <- i + 1
    for (j in min:nt) {
      dist.vector[k] <- phytools::fastDist(nj.tree, tip.vector[i], tip.vector[j])
      k <- k + 1
    }
  }
  return(mean(dist.vector))
}

# calculate average pairwise distance (APD) among domesticates
dom.dist <- mean_pw_distance(nj.tree, dom.tips)

# take random tip samples, compute APD among these many time
sam.num <- 1000
sam.dist <- vector(mode = "numeric", length = sam.num)
for (i in 1:sam.num) {
  sam.tips <- sample(nj.tree$tip.label, length(dom.tips), replace = F)
  sam.dist[i] <- mean_pw_distance(nj.tree, sam.tips)
}


{
  pdf(file = clustering.pdf.file)
  h <- hist(
    sam.dist,
    xlab = sprintf("Mean pairwise patristic distance in random sample of %i tips (n=%i)", length(dom.tips), sam.num),
    col = "lightblue",
    main = "Domesticated Ungulates cluster in niche space"
  )

  # fit normal distribution
  xfit <- seq(min(sam.dist), max(sam.dist), length = length(sam.dist))
  yfit <- dnorm(xfit, mean = mean(sam.dist), sd = sd(sam.dist))
  yfit <- yfit * diff(h$mids[1:2]) * length(sam.dist)
  lines(xfit, yfit, col = "black", lwd = 2)

  # draw ± 2 * standard deviation
  abline(v = mean(sam.dist) + 2 * sd(sam.dist), col = "green")
  abline(v = mean(sam.dist) - 2 * sd(sam.dist), col = "green")

  # indicate domesticated APD
  abline(v = dom.dist, col = "red")
  dev.off()
}
}
}