# Ancestral state reconstruction (discrete) # Once we have our phylogeny and data paired, we can reconstruct character evolution by # inferring ancestral character states. # 1) Make a working directory (e.g., discrete) and select it as your working. setwd("/Users/jcsantos/Desktop/R_class_winter_2015_home/0_Bio559R_course_final_files/week_8/data/") # 2) We need to download these files from the course website into our working directory: bird_data_pruned.txt bird_pruned_MB.newick tyrannidae_reduced_data.txt tyrannidae_reduced_tree.newick # 3) Several packages and functions are available for this purpose. library(ape) library(geiger) library(phytools) # 4) Load our trees and data from last class and check concordance: bird_pruned_tree <- read.tree("bird_pruned_MB.newick") bird_pruned_tree birds_traits_data <- read.table("bird_data_pruned.txt", header = TRUE, sep = "\t") birds_traits_data # use the 'name.check' function to determine if we have concordance name.check (bird_pruned_tree, birds_traits_data) # we check the concordance between a data file and a phylogenetic tree. # [1] "OK" # we have concordance (i.e., we the same number of tips and data per tip) # this is your exercise: tyrannidae_reduced_tree <- read.tree("tyrannidae_reduced_tree.newick") tyrannidae_reduced_data <- read.table("tyrannidae_reduced_data.txt", header = TRUE, sep = "\t") name.check (tyrannidae_reduced_tree, tyrannidae_reduced_data) # we check the concordance between a data file and a phylogenetic tree. # [1] "OK" # we have concordance #5) We need to make sure that our tree is ultrametric tree and preferably bifurcating. is.ultrametric(bird_pruned_tree) # TRUE is.binary.tree(bird_pruned_tree) # TRUE all nodes (including the root node) have exactly two descendant nodes is.ultrametric(tyrannidae_reduced_tree) # TRUE is.binary.tree(tyrannidae_reduced_tree) # TRUE all nodes (including the root node) have exactly two descendant nodes #6) To plot, we need to sort variable values to match phylogeny names order match_birds_traits_data <- match(bird_pruned_tree$tip.label, rownames(birds_traits_data)) #use match function sorted_birds_traits_data <- as.data.frame(birds_traits_data[match_birds_traits_data,]) sorted_birds_traits_data #7) Prepare data for plotting the Food discrete traut. We are going to use colored label for discrete traits #Food Food_birds_label <- character(length(sorted_birds_traits_data$Food)) names(Food_birds_label) <- rownames(sorted_birds_traits_data) #get the unique states for Time unique(sorted_birds_traits_data$Food) #[1] vegetarian carnivore omnivore #label states with colors for plot Food_birds_label[sorted_birds_traits_data$Food=="vegetarian"] <- "green3" #assign color Food_birds_label[sorted_birds_traits_data$Food=="carnivore"] <- "red" #assign color Food_birds_label[sorted_birds_traits_data$Food=="omnivore"] <- "darkorchid" #assign color Food_birds_label #7) plot data plot(bird_pruned_tree, cex=1, show.tip.label = FALSE) #Plot a tree that leaves some room between the tree tips and taxon labels so that we can plot habitat use in this space points(rep(122.5, nrow(sorted_birds_traits_data)), 1:nrow(sorted_birds_traits_data), pch=22, bg=Food_birds_label[bird_pruned_tree$tip.label], lwd = 0.05, col="white", cex=1.5) #8) We can use the ace (Ancestral Character Estimation) function of ape to reconstruct ancestral character states using maximum likelihood: # ace(x, phy, type = "discrete", # method = "ML", # only option for discrete characters, following Pagel 1994 # CI = TRUE, # 95% confidence intervals # model = "ER", # Four option, equal rates (ER), symmetrical (SYM), All-rates different (ARD), custom matrix # scaled = TRUE, # kappa = 1, # if branch lengths needs transformation, a positive value giving the exponent for such transformation # corStruct = NULL, # ip = 0.1, # use.expm = FALSE, # use.eigen = TRUE, # marginal = FALSE) # returns the node marginal reconstruction (likelihood values) at each node # 9) Let's explore the "ER" model: food_bird_ace_ER <- ace(birds_traits_data$Food, bird_pruned_tree, type="discrete", model = "ER", marginal = TRUE) #Equal rates food_bird_ace_ER # Ancestral Character Estimation # # Call: ace(x = birds_traits_data$Food, phy = bird_pruned_tree, type = "discrete", model = "ER") # # Log-likelihood: -406.578 # # Rate index matrix: # carnivore omnivore vegetarian # carnivore . 1 1 # omnivore 1 . 1 # vegetarian 1 1 . # # Parameter estimates: # rate index estimate std-err # 1 0.0083 7e-04 # # Scaled likelihoods at the root (type '...$lik.anc' to get them for all nodes): # carnivore omnivore vegetarian # 0.5489050 0.1842301 0.2668649 #10) you can call the marginal probabilities at any node along the tree food_bird_ace_ER$lik.anc # we get the marginal values for each node each should add to ~1.0 # carnivore omnivore vegetarian # [1,] 5.489050e-01 1.842301e-01 2.668649e-01 # [2,] 3.832150e-01 3.246254e-01 2.921596e-01 # [3,] 5.018158e-01 1.908888e-01 3.072954e-01 # [4,] 7.351661e-01 1.519711e-01 1.128628e-01 # [5,] 8.025606e-01 1.880971e-01 9.342243e-03 #11) Let's explore the ”SYM" model: food_bird_ace_SYM <- ace(birds_traits_data$Food, bird_pruned_tree, type="discrete", model = "SYM") #Symmetrical food_bird_ace_SYM # Ancestral Character Estimation # # Call: ace(x = birds_traits_data$Food, phy = bird_pruned_tree, type = "discrete", model = "SYM") # # Log-likelihood: -401.9363 # # Rate index matrix: # carnivore omnivore vegetarian # carnivore . 1 2 # omnivore 1 . 3 # vegetarian 2 3 . # # Parameter estimates: # rate index estimate std-err # 1 0.0111 0.0016 # 2 0.0044 0.0009 # 3 0.0129 0.0024 # # Scaled likelihoods at the root (type '...$lik.anc' to get them for all nodes): # carnivore omnivore vegetarian # 0.5452463 0.2518925 0.2028612 #12) Let's explore the ”ARD" model: food_bird_ace_ARD <- ace(birds_traits_data$Food, bird_pruned_tree, type="discrete", model = "ARD") #All-rates different food_bird_ace_ARD # Ancestral Character Estimation # # Call: ace(x = birds_traits_data$Food, phy = bird_pruned_tree, type = "discrete", model = "ARD") # # Log-likelihood: -386.3962 # # Rate index matrix: # carnivore omnivore vegetarian # carnivore . 3 5 # omnivore 1 . 6 # vegetarian 2 4 . # # Parameter estimates: # rate index estimate std-err # 1 0.0174 0.0025 # 2 0.0095 0.0017 # 3 0.0064 0.0014 # 4 0.0056 0.0017 # 5 0.0005 0.0006 # 6 0.0153 0.0024 # # Scaled likelihoods at the root (type '...$lik.anc' to get them for all nodes): # carnivore omnivore vegetarian # 0.0379084 0.2905876 0.6715040 #13) We can plot the marginal likelihoods for each state reconstruction at each node using pie diagrams: #Food # We need to sort variable values to match phylogeny names order match_birds_data <- match(bird_pruned_tree$tip.label, rownames(birds_traits_data)) #use match function sorted_birds_data <- as.data.frame(birds_traits_data[match_birds_data,]) sorted_birds_data #labels Food_birds_label <- character(length(sorted_birds_data$Food)) names(Food_birds_label) <- rownames(sorted_birds_data) #get the unique states for Food unique(sorted_birds_data$Food) #[1] vegetarian carnivore omnivore #label states with colors for plot Food_birds_label[sorted_birds_data$Food=="vegetarian"] <- "vegetarian" #assign color Food_birds_label[sorted_birds_data$Food=="carnivore"] <- "carnivore" #assign color Food_birds_label[sorted_birds_data$Food=="omnivore"] <- "omnivore" #assign color Food_birds_label # 14) To create a binary matrix for our characters (useful for drawing) # we use 'to.matrix’ function of phytools food_pie_matrix <- to.matrix(Food_birds_label, sort(unique(Food_birds_label))) food_pie_matrix # carnivore omnivore vegetarian # Nothoprocta_perdicaria 0 0 1 # Apteryx_australis 1 0 0 # Apteryx_haastii 1 0 0 #15) plot data plot(bird_pruned_tree, show.node.label=FALSE, show.tip.label = FALSE, label.offset=3, cex=0.4, type = "fan") tiplabels(pie = food_pie_matrix[bird_pruned_tree$tip.label, ], piecol = c("red", "darkorchid", "green3"), cex = 0.12) nodelabels(pie=food_bird_ace_ER$lik.anc, piecol=c("red", "darkorchid", "green3"), cex = 0.12) axisPhylo() plot(bird_pruned_tree, show.node.label=FALSE, show.tip.label = FALSE, label.offset=3, cex=0.4) tiplabels(pie = food_pie_matrix[bird_pruned_tree$tip.label, ], piecol = c("red", "darkorchid", "green3"), cex = 0.12) nodelabels(pie=food_bird_ace_SYM$lik.anc, piecol=c("red", "darkorchid", "green3"), cex = 0.12) plot(bird_pruned_tree, show.node.label=FALSE, show.tip.label = FALSE, label.offset=3, cex=0.4) tiplabels(pie = food_pie_matrix[bird_pruned_tree$tip.label, ], piecol = c("red", "darkorchid", "green3"), cex = 0.12) nodelabels(pie=food_bird_ace_ARD$lik.anc, piecol=c("red", "darkorchid", "green3"), cex = 0.12) axisPhylo() #Other references: http://www.phytools.org/eqg/Exercise_5.2/ #Other references: http://www.evovert.com/R_using_Phylogenies.html ##################################### GEIGER ############################################# # One of the best and more flexible methods are implemented using # the ‘fitDiscrete’ function in 'geiger’. This method provides a set of statistics that allows #us to decide which model best describes the trait that we want to reconstruct. #Notice: This also depends on our knowledge about our system of study. # 16) First, we need to prepare our data using the function 'treedata' Food_birds_geiger <- treedata(bird_pruned_tree, birds_traits_data) Food_birds_geiger$phy # a class phylo Food_birds_geiger$data # a matrix # 17) Let's explore the 'fitDiscrete' function ?fitDiscrete # The 'fitDiscrete' allows to model fitting for discrete comparative data # This function returns parameter estimates and the likelihood for univariate datasets. # All of the models are continuous-time Markov models of trait evolution. # This type of models allow modeling a random process where the probability of change # between character states depends only on the current state. For a binary trait # (i.e., 0-absent, 1-present) there should be at any time the chance from change from # 0-absent to 1-present or viceversa. This is represented by the instantaneous rate matrix (Q) #Basic models # ER is an equal-rates model of where a single parameter governs all transition rates # same rate: 0 <-> 1, 0 <-> 2, 1 <-> 2 # SYM is a symmetric model where forward and reverse transitions share the same parameter # rate-1: 0 <-> 1, rate-2: 0 <-> 2, rate-3: 1 <-> 2 # ARD is an all-rates-different model where each rate is a unique parameter # all different rates #18) Let's estimate the ER model food_bird_ace_ER_geiger <- fitDiscrete(bird_pruned_tree, Food_birds_geiger$data [,13], type="discrete", model = "ER") food_bird_ace_ER_geiger #GEIGER-fitted comparative model of discrete data # fitted Q matrix: <-- This is the instantaneous matrix # carnivore omnivore vegetarian # carnivore -0.010154425 0.005077212 0.005077212 # omnivore 0.005077212 -0.010154425 0.005077212 # vegetarian 0.005077212 0.005077212 -0.010154425 # # model summary: <-- This is set of statistics to compare models # log-likelihood = -326.374043 # AIC = 654.748086 # AICc = 654.756994 # free parameters = 1 # # Convergence diagnostics: <-- Report in the convergence # optimization iterations = 100 # failed iterations = 0 # frequency of best fit = 1.00 # # object summary: <-- objects that can be called for analyses # 'lik' -- likelihood function # 'bnd' -- bounds for likelihood search # 'res' -- optimization iteration summary # 'opt' -- maximum likelihood parameter estimates # 19) Let's estimate the SYM model food_bird_ace_SYM_geiger <- fitDiscrete(bird_pruned_tree, Food_birds_geiger$data [,13], type="discrete", model = "SYM") food_bird_ace_SYM_geiger #GEIGER-fitted comparative model of discrete data # fitted Q matrix: # carnivore omnivore vegetarian # carnivore -0.007866836 0.005917884 0.001948951 # omnivore 0.005917884 -0.017907770 0.011989886 # vegetarian 0.001948951 0.011989886 -0.013938837 # # model summary: # log-likelihood = -315.365882 # AIC = 636.731764 # AICc = 636.785455 # free parameters = 3 # #Convergence diagnostics: # optimization iterations = 100 # failed iterations = 0 # frequency of best fit = 0.06 # # object summary: # 'lik' -- likelihood function # 'bnd' -- bounds for likelihood search # 'res' -- optimization iteration summary # 'opt' -- maximum likelihood parameter estimates # 20) Let's estimate the ARD model food_bird_ace_ARD_geiger <- fitDiscrete(bird_pruned_tree, Food_birds_geiger$data [,13], type="discrete", model = "ARD") food_bird_ace_ARD_geiger # GEIGER-fitted comparative model of discrete data # fitted Q matrix: # carnivore omnivore vegetarian # carnivore -0.002923285 0.002923285 8.825334e-20 # omnivore 0.017452911 -0.032781504 1.532859e-02 # vegetarian 0.001587524 0.006893189 -8.480713e-03 # # model summary: # log-likelihood = -304.427936 # AIC = 620.855872 # AICc = 621.045062 # free parameters = 6 # #Convergence diagnostics: # optimization iterations = 100 # failed iterations = 0 # frequency of best fit = 0.06 # # object summary: # 'lik' -- likelihood function # 'bnd' -- bounds for likelihood search # 'res' -- optimization iteration summary # 'opt' -- maximum likelihood parameter estimates # 21) Let's compare the model outcomes: # model summary: # log-likelihood = -315.365882 # AIC = 636.731764 # AICc = 636.785455 # free parameters = 3 food_bird_ace_ARD_geiger # model summary: # log-likelihood = -304.427936 # AIC = 620.855872 # AICc = 621.045062 $ free parameters = 6 # 22) Let's compare models based on the summaries # parameters # free parameters = 1 #food_bird_ace_ER_geiger -- most simple # free parameters = 3 #food_bird_ace_SYM_geiger # free parameters = 6 #food_bird_ace_ADR_geiger -- most complex # log-likelihood # log-likelihood = -326.374043 #food_bird_ace_ER_geiger # log-likelihood = -315.365882 #food_bird_ace_SYM_geiger # log-likelihood = -304.427936 #food_bird_ace_ADR_geiger # AICc # AICc = 654.756994 #food_bird_ace_ER_geiger # AICc = 636.785455 #food_bird_ace_SYM_geiger # AICc = 621.045062 #food_bird_ace_ADR_geiger # 23) For model comparison, the model with the lowest AIC score is preferred # consider # |AICc_model1 - AICc_model2| < 2-3 (no support for the more complex model) # |AICc_model1 - AICc_model2| = 4-9 (some support for the more complex model) # |AICc_model1 - AICc_model2| > 10 (so support for the simple model) ER_versus_SYM <- abs(food_bird_ace_ER_geiger$opt$aicc - food_bird_ace_SYM_geiger$opt$aicc) ER_versus_SYM # 17.97154 -- no support from ER ER_versus_ARD <- abs(food_bird_ace_ER_geiger$opt$aicc - food_bird_ace_ARD_geiger$opt$aicc) ER_versus_ARD # 33.71193 -- no support from ER SYM_versus_ARD <- abs(food_bird_ace_SYM_geiger$opt$aicc - food_bird_ace_ARD_geiger$opt$aicc) SYM_versus_ARD # 15.74039 -- no support from SYM #Delta AICc suggest that ARD is the best model # 24) We can compare the likelihood of the two models using a likelihood ratio # test. You can calculate the p-value of the likelihood ratio test in R by comparing it # to the chi-square distribution with the degrees of freedom are equal to the difference # in number of parameters for the two models. ##### ER versus SYM d_ER_versus_SYM <- abs(2*(food_bird_ace_ER_geiger$opt$lnL-food_bird_ace_SYM_geiger$opt$lnL)) d_ER_versus_SYM #[1] 22.01632 p_value_ER_versus_SYM <- pchisq(d_ER_versus_SYM, 3-1, lower.tail=FALSE) p_value_ER_versus_SYM #[1] 1.656595e-05 -- Rejects ER-model ##### ER versus ARD d_ER_versus_ARD <- abs(2*(food_bird_ace_ER_geiger$opt$lnL-food_bird_ace_ARD_geiger$opt$lnL)) d_ER_versus_ARD #[1] 43.89221 p_value_ER_versus_ARD <- pchisq(d_ER_versus_ARD, 6-1, lower.tail=FALSE) p_value_ER_versus_ARD #[1] 2.435899e-08 -- Rejects ER-model ##### SYM versus ARD d_SYM_versus_ARD <- abs(2*(food_bird_ace_SYM_geiger$opt$lnL-food_bird_ace_ARD_geiger$opt$lnL)) d_SYM_versus_ARD #[1] 21.87589 p_value_SYM_versus_ARD <- pchisq(d_SYM_versus_ARD, 6-3, lower.tail=FALSE) p_value_SYM_versus_ARD #[1] 6.922698e-05 -- Rejects SYM-model ### We can conclude that ARD does a better job than the other models to trace the evolution ### of food trait in birds #################### Phylogenetic Signal for Discrete Characters ######################### # We can use Pagel's lambda to assesses the phylogenetic signal (tendency of closely related # species to resemble for an specific trait). Lambda parameter is used as a tree transformation # parameter that multiply the internal branches of the tree by values between 0 and 1. # A lambda of 1 indicates strong signal # A lambda of 0 indicates son signal # We can determine the magnitude of the signal by estimating the lambda in our tree and then # compare the Log-lik with that of tree transformed to a polytomy (i.e., branches multiplied by # lambda = 0) # Pagel, M. 1994. Detecting correlated evolution on phylogenies: a general method for the comparative analysis of # discrete characters. Proceedings of the Royal Society of London B 255:37-45. # 25) Let's estimate our lambda 0 tree bird_pruned_tree_lambda_0 <- rescale(bird_pruned_tree, model = "lambda", 0) #Let's plot this tree and the original bird tree par(mfrow=c(1,2)) plot(bird_pruned_tree, edge.width = 1, show.tip.label = FALSE) #smaller tree add.scale.bar(cex = 0.7, font = 2, col = "red") plot(bird_pruned_tree_lambda_0, edge.width = 1, show.tip.label = FALSE) #smaller tree add.scale.bar(cex = 0.7, font = 2, col = "red") #26) Let's fit the lambda transformation to determine if there is signal # Fit lambda to our tree food_bird_ace_ARD_geiger_lambda <- fitDiscrete(bird_pruned_tree, Food_birds_geiger$data [,13], type="discrete", model = "ARD", transform = "lambda", niter = 500) food_bird_ace_ARD_geiger_lambda # GEIGER-fitted comparative model of discrete data # fitted Q matrix: # carnivore omnivore vegetarian # carnivore -1.950375e-03 0.001950375 4.052955e-16 # omnivore 1.255327e-02 -0.024693463 1.214019e-02 # vegetarian 1.481506e-16 0.001482542 -1.482542e-03 # # fitted ‘lambda’ model parameter: # lambda = 0.906137 # # model summary: # log-likelihood = -290.226291 # AIC = 594.452583 # AICc = 594.705404 # free parameters = 7 # # Convergence diagnostics: # optimization iterations = 100 # failed iterations = 63 # frequency of best fit = NA # # object summary: # 'lik' -- likelihood function # 'bnd' -- bounds for likelihood search # 'res' -- optimization iteration summary # 'opt' -- maximum likelihood parameter estimates # 27) Fit the lamda 0 tree (no signal) food_bird_ace_ARD_geiger_lambda_0 <- fitDiscrete(bird_pruned_tree_lambda_0, Food_birds_geiger$data [,13], type="discrete", model = "ARD", transform = "lambda", niter = 500) #GEIGER-fitted comparative model of discrete data # fitted Q matrix: # carnivore omnivore vegetarian # carnivore -0.0069920757 0.0029614817 0.0040305940 # omnivore 0.0005590967 -0.0013650835 0.0008059868 # vegetarian 0.0005886789 0.0007984343 -0.0013871132 # # fitted ‘lambda’ model parameter: # lambda = 0.069683 # # model summary: # log-likelihood = -479.923127 # AIC = 973.846253 # AICc = 974.099075 # free parameters = 7 # #Convergence diagnostics: # optimization iterations = 100 # failed iterations = 0 # frequency of best fit = 0.45 #28) For model comparison, the model with the lowest AIC score is preferred our_tree_versus_lambda_0_aicc <- abs(food_bird_ace_ARD_geiger_lambda_0$opt$aicc - food_bird_ace_ARD_geiger_lambda$opt$aicc) our_tree_versus_lambda_0_aicc # 379.3937 -- lambda 0 tree (no signal) is very unlikely ##### our tree versus lambda 0 d_our_tree_versus_lambda_0 <- abs(2*(food_bird_ace_ARD_geiger_lambda_0$opt$lnL-food_bird_ace_ARD_geiger_lambda$opt$lnL)) d_our_tree_versus_lambda_0 #[1] 379.3937 p_our_tree_versus_lambda_0 <- pchisq(d_our_tree_versus_lambda_0, 1, lower.tail=FALSE) p_our_tree_versus_lambda_0 #[1] 1.686434e-84 -- Rejects no signal model ########### Testing if rates have increased/decreased Discrete Characters ################ # 29) We can test whether rates have increased or slowed over evolutionary time # using 'fitDiscrete'. We can use the delta model. # The delta model fits the relative contributions of early versus late evolution # in the tree to the covariance of species trait values. # If delta > 1, recent evolution has been relatively fast # If delta < 1, recent evolution has been comparatively slow. # If rates decrease over time it is a signature of adaptive radiation if the traits # are ecologically relevant. food_bird_ace_ARD_geiger_delta <- fitDiscrete(bird_pruned_tree, Food_birds_geiger$data [,13], type="discrete", model = "ARD", transform = "delta") #GEIGER-fitted comparative model of discrete data # fitted Q matrix: # carnivore omnivore vegetarian # carnivore -0.0018102415 0.001810241 2.114637e-38 # omnivore 0.0120647007 -0.022792976 1.072828e-02 # vegetarian 0.0004968849 0.003132820 -3.629705e-03 # # fitted ‘delta’ model parameter: # delta = 1.691392 # # model summary: # log-likelihood = -303.756563 # AIC = 621.513126 # AICc = 621.765947 # free parameters = 7 # # Convergence diagnostics: # optimization iterations = 100 # failed iterations = 0 # frequency of best fit = 0.01 # # object summary: # 'lik' -- likelihood function # 'bnd' -- bounds for likelihood search # 'res' -- optimization iteration summary # 'opt' -- maximum likelihood parameter estimates #36) compare the aicc values # AICc = 621.765947 for delta # AICc = 654.756994 for equal rates # For model comparison, the model with the lowest AIC score is preferred # difference in AICc is 32.991047. Then delta model is supported # fitted ‘delta’ model parameter: # delta = 1.691392 # The delta > 1 suggest that recent evolution has been relatively fast ########################################################################################## ########################################################################################## ########################################################################################## ##########################################################################################