#Taxonomic classification and distribution data from Gbif using R #0) packages relevant for taxonomy and distribution #http://ropensci.org/tutorials/rgbif_tutorial.html #http://ropensci.org/tutorials/taxize_tutorial.html install.packages("taxize") install.packages("rgbif") #1) load libraries library("taxize") library("rgbif") #2) set working directory setwd("~/Desktop/R_class_winter_2015_home/0_Bio559R_course_final_files/week_13/data") # 3) Load our data from last class: birds_traits_data <- read.table("bird_data_pruned.txt", header = TRUE, sep = "\t") birds_traits_list <- read.table("species_list.txt", header = TRUE, sep = "\t") # 4) Add species list to main data birds_traits_data_list <- cbind(birds_traits_list,birds_traits_data) # merge two data frames birds_traits_data_list$Species_list <- as.character(birds_traits_data_list$Species_list) # transform from factor to character # 5) However, you can also work using only the species list dataframe species <- c(t(birds_traits_list)) # as a vector species # 6) Taxonomy hierarchy using diverse databases: # We can get full classification using A number of data sources in taxize provide the # capability to retrieve higher taxonomic names, but we will highlight two of the more classfication_sp_ncbi <- lapply (1:2, function (x) {classification(species[x], db = 'ncbi')}) # National Center for Biotechnology Information classfication_sp_ncbi # [[1]] # $`Struthio camelus` # name rank id # 1 cellular organisms no rank 131567 # 2 Eukaryota superkingdom 2759 # 3 Opisthokonta no rank 33154 # 4 Metazoa kingdom 33208 # 5 Eumetazoa no rank 6072 # 6 Bilateria no rank 33213 # 7 Deuterostomia no rank 33511 # 8 Chordata phylum 7711 # 9 Craniata subphylum 89593 # 10 Vertebrata no rank 7742 # 11 Gnathostomata no rank 7776 # 12 Teleostomi no rank 117570 # 13 Euteleostomi no rank 117571 # 14 Sarcopterygii no rank 8287 # 15 Dipnotetrapodomorpha no rank 1338369 # 16 Tetrapoda no rank 32523 # 17 Amniota no rank 32524 # 18 Sauropsida no rank 8457 # 19 Sauria no rank 32561 # 20 Archelosauria no rank 1329799 # 21 Archosauria no rank 8492 # 22 Dinosauria no rank 436486 # 23 Saurischia no rank 436489 # 24 Theropoda no rank 436491 # 25 Coelurosauria no rank 436492 # 26 Aves class 8782 # 27 Palaeognathae superorder 8783 # 28 Struthioniformes order 8798 # 29 Struthionidae family 8799 # 30 Struthio genus 8800 # 31 Struthio camelus species 8801 classfication_sp_gbif <- lapply (1:2, function (x) {classification(species[x], db = 'gbif')}) # Global Biodiversity Information Facility classfication_sp_gbif # [[1]] # $`Struthio camelus` # name rank id # 1 Animalia kingdom 1 # 2 Chordata phylum 44 # 3 Aves class 212 # 4 Struthioniformes order 725 # 5 Struthionidae family 9349 # 6 Struthio genus 2495148 # 7 Struthio camelus species 2495150 classfication_sp_itis <- lapply (1:2, function (x) {classification(species[x], db = 'itis')}) # Integrated Taxonomic Information System classfication_sp_itis # [[1]] # $`Struthio camelus` # name rank id # 1 Animalia Kingdom 202423 # 2 Bilateria Subkingdom 914154 # 3 Deuterostomia Infrakingdom 914156 # 4 Chordata Phylum 158852 # 5 Vertebrata Subphylum 331030 # 6 Gnathostomata Infraphylum 914179 # 7 Tetrapoda Superclass 914181 # 8 Aves Class 174371 # 9 Struthioniformes Order 174372 # 10 Struthionidae Family 174373 # 11 Struthio Genus 174374 # 12 Struthio camelus Species 174375 classfication_sp_col <- lapply (1:2, function (x) {classification(species[x], db = 'col')}) # Catalogue of Life classfication_sp_col # [[1]] # $`Struthio camelus` # name rank id # 1 Animalia Kingdom 22032961 # 2 Chordata Phylum 22032976 # 3 Aves Class 22033651 # 4 Struthioniformes Order 22041922 # 5 Struthionidae Family 22041927 # 6 Struthio Genus 22044859 # 7 Struthio camelus Species 11908938 ######################################################################################################################## ############### Georeferencing uisng GBIF ## 7) Choosing our species of interest species[1] # [1] "Struthio camelus" ### 8) Find the GBIF ID key and reducing taxonomic redundancy or accounting recent taxonomic changes. We also need to provide 'rank': A taxonomic rank such as species key1 <- name_suggest(q=species[1], rank='species')$key[1] key1 #[1] 2495150 <-- unique taxonomic number that can be used to search in the gbif website # For example: # http://www.gbif.org/species/2495150 (you can check this key manually in gbif) ## 9) get occurrences in R species_data <- occ_search(taxonKey=key1, hasCoordinate=TRUE, fields=c('name', 'gbifID', 'decimalLatitude', 'decimalLongitude', 'basisOfRecord', 'year', 'country', 'institutionCode', 'collectionCode', 'catalogNumber', 'verbatimLocality', 'verbatimEventDate', 'occurrenceRemarks', 'stateProvince', 'county', 'verbatimElevation', 'locality'), limit=1000, # get only 1000 records return='data') head(species_data) name basisOfRecord decimalLongitude decimalLatitude year country gbifID institutionCode 1 Struthio camelus HUMAN_OBSERVATION 35.74814 -3.56551 2014 Tanzania, United Republic of 923394342 naturgucker 2 Struthio camelus HUMAN_OBSERVATION 35.56103 -3.17274 2014 Tanzania, United Republic of 923372862 naturgucker 3 Struthio camelus HUMAN_OBSERVATION 35.04776 -2.93481 2014 Tanzania, United Republic of 923374910 naturgucker 4 Struthio camelus HUMAN_OBSERVATION 32.77582 -19.13465 2014 Zimbabwe 899952074 iNaturalist 5 Struthio camelus HUMAN_OBSERVATION 36.96805 0.03858 2014 Kenya 1024214382 iNaturalist 6 Struthio camelus HUMAN_OBSERVATION 25.95567 -24.63599 2014 Botswana 1038340462 iNaturalist catalogNumber locality collectionCode verbatimEventDate verbatimLocality stateProvince 1 323828999 Lake Manyara National Park naturgucker 2 -869612390 Ngorongoro-Krater, Kraterboden naturgucker 3 -1565621799 Serengeti Plains naturgucker 4 577792 Observations 2014-02-27 Mutare Zimbabwe 5 844302 Observations August 11, 2014 10:44 Ol Pejeta 6 1038579 Observations 2014-08-26 14:22:18 Gaborone, South-East, Botswana ## 10) prepare and plot distribution data on map # install.packages("maps") library(maps) map("world", xlim = range(species_data$decimalLongitude), ylim = range(species_data$decimalLatitude)) points(species_data$decimalLongitude, species_data$decimalLatitude, col = "blue", cex = 1.5) ########################################################################################## ################## Phylogenetic Principal Component Analyses ###################### ########################################################################################## library(ape) library(geiger) library(phytools) install.packages("psych") library(psych) install.packages("GPArotation") library("GPArotation") ## 11) we are going to use a exemplar data set from Darwin's finches data(geospiza) finches_phy <- geospiza$phy finches_data <- geospiza$dat finches_data # wingL tarsusL culmenL beakD gonysW # magnirostris 4.404200 3.038950 2.724667 2.823767 2.675983 # conirostris 4.349867 2.984200 2.654400 2.513800 2.360167 # difficilis 4.224067 2.898917 2.277183 2.011100 1.929983 # scandens 4.261222 2.929033 2.621789 2.144700 2.036944 # fortis 4.244008 2.894717 2.407025 2.362658 2.221867 # fuliginosa 4.132957 2.806514 2.094971 1.941157 1.845379 # pallida 4.265425 3.089450 2.430250 2.016350 1.949125 # fusca 3.975393 2.936536 2.051843 1.191264 1.401186 # parvulus 4.131600 2.973060 1.974420 1.873540 1.813340 # pauper 4.232500 3.035900 2.187000 2.073400 1.962100 # Pinaroloxias 4.188600 2.980200 2.311100 1.547500 1.630100 # Platyspiza 4.419686 3.270543 2.331471 2.347471 2.282443 # psittacula 4.235020 3.049120 2.259640 2.230040 2.073940 ## 12) non-phylogenetic principal component analysis # Non-Rotated Principal Components finches_non_rotated_pca <- princomp(finches_data, cor=TRUE) summary(finches_non_rotated_pca) # print components and the variance accounted for # Importance of components: # Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 # Standard deviation 1.9234840 0.9503424 0.57986608 0.23166735 0.084523804 # Proportion of Variance 0.7399581 0.1806301 0.06724893 0.01073395 0.001428855 # Cumulative Proportion 0.7399581 0.9205883 0.98783719 0.99857115 1.000000000 # Notice: You will have as many components as variables (i.e., 5 components) loadings(finches_non_rotated_pca) # pc loadings of each variable on the components # Loadings: # Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 # wingL -0.505 0.144 0.841 0.129 # tarsusL -0.261 0.903 -0.131 -0.301 # culmenL -0.430 -0.322 -0.812 -0.210 # beakD -0.491 -0.195 0.458 -0.141 -0.701 # gonysW -0.502 -0.149 0.338 -0.372 0.688 plot(finches_non_rotated_pca,type="lines") # scree plot to determine how many components are useful # Notice: Use the rule to choose principal component based on the proportion of variance # explained ~ 1.0. Similarly, check the “scree plot” and the number of components before # the "elbow” are considered those that explain most of the variance # (i.e., appears that 2 components are good enough) # Varimax Rotated Principal Components finches_varimax_pca <- principal(finches_data, nfactors=2, rotate="varimax") finches_varimax_pca # Principal Components Analysis # Call: principal(r = finches_data, nfactors = 2, rotate = "varimax") # Standardized loadings (pattern matrix) based upon correlation matrix # RC1 RC2 h2 u2 # wingL 0.85 0.49 0.96 0.038 # tarsusL 0.15 0.98 0.99 0.011 # culmenL 0.88 0.02 0.78 0.224 # beakD 0.94 0.18 0.92 0.075 # gonysW 0.95 0.23 0.95 0.049 # RC1 RC2 # SS loadings 3.31 1.29 # Proportion Var 0.66 0.26 # Cumulative Var 0.66 0.92 # Proportion Explained 0.72 0.28 # Cumulative Proportion 0.72 1.00 # Test of the hypothesis that 2 components are sufficient. # The degrees of freedom for the null model are 10 and the objective function was 7.75 # The degrees of freedom for the model are 1 and the objective function was 1.86 # The total number of observations was 13 with MLE Chi Square = 15.2 with prob < 9.7e-05 # Fit based upon off diagonal values = 0.99 # You can get the scores of each species on the two components finches_varimax_pca$scores # RC1 RC2 # magnirostris 1.89987528 0.07182417 # conirostris 1.29886059 -0.29905083 # difficilis -0.04414877 -0.69844300 # scandens 0.64690615 -0.76496493 # fortis 0.71545604 -0.94622865 # fuliginosa -0.39439635 -1.43133575 # pallida -0.12006058 0.79420851 # fusca -1.94045036 -0.39800242 # parvulus -0.98365427 0.07426226 # pauper -0.35808875 0.51428958 # Pinaroloxias -0.84030741 0.01600282 # Platyspiza 0.17552395 2.56433028 # psittacula -0.05551553 0.50310795 # Notice: You can use these scores if you want reduce the numbers of variables in your # analyses # plot scores in a bivariate scatter plot finches_varimax_pca_scores <- as.data.frame(finches_varimax_pca$scores) finches_varimax_pca_scores$names <- row.names(finches_varimax_pca_scores) plot(finches_varimax_pca_scores$RC1, finches_varimax_pca_scores$RC2, main="Scatterplot finches_varimax_pca_scores", xlab="PC1", ylab="PC2", pch=19) text(finches_varimax_pca_scores$RC1, finches_varimax_pca_scores$RC2+0.1, labels=finches_varimax_pca_scores$names, col= "blue", cex= 0.7) ## 13) phylogenetic principal component analysis ## data preparation finches_phy <- geospiza$phy finches_data <- geospiza$dat # Has to be a matrix ## analyses finches_non_rotated_ppca<- phyl.pca(finches_phy, finches_data, method="lambda", mode="cov") finches_non_rotated_ppca # Phylogenetic pca # Starndard deviations: # PC1 PC2 PC3 PC4 PC5 # 1.05465929 0.28033469 0.13637879 0.06275309 0.03932391 # Loads: # PC1 PC2 PC3 PC4 PC5 # wingL -0.9486734 -0.09458879 0.220949927 -0.163218107 0.124950837 # tarsusL -0.7638619 -0.10839839 0.629634425 0.035310984 -0.084133168 # culmenL -0.8694375 -0.49144869 -0.049894201 0.004454391 -0.006877273 # beakD -0.9869568 0.14929749 -0.039705808 -0.039285967 -0.022507739 # gonysW -0.9905593 0.11456555 0.007906274 0.070172010 0.026083118 # lambda: # [1] 0.9472762 ## get the scores finches_ppca_scores <- as.data.frame(finches_non_rotated_ppca$S) finches_ppca_scores$names <- row.names(finches_ppca_scores) # PC1 PC2 PC3 PC4 PC5 names # magnirostris -1.34885916 0.17473178 -0.167737386 0.01872918 1.856872e-02 magnirostris # conirostris -0.91504424 0.04933744 -0.169647804 -0.03320557 9.274482e-03 conirostris # difficilis -0.14139908 0.10179182 -0.131633564 -0.04942487 2.233686e-02 difficilis # scandens -0.44632331 -0.13109651 -0.179164054 -0.04266454 -6.119628e-03 scandens # fortis -0.59913245 0.19131830 -0.215032007 -0.01669147 -6.676847e-05 fortis # fuliginosa 0.06018309 0.22931815 -0.201356688 -0.03940483 7.159045e-03 fuliginosa # pallida -0.25432479 -0.04427856 0.024159312 -0.03649313 -3.519028e-02 pallida # fusca 0.84403539 -0.12504135 0.006134237 0.09897982 -2.510882e-02 fusca # parvulus 0.15069258 0.29010354 -0.018348867 -0.01629369 -3.708680e-02 parvulus # pauper -0.18449300 0.20237606 -0.001795159 -0.04879382 -3.287673e-02 pauper # Pinaroloxias 0.31985257 -0.18183426 0.007582201 -0.01433011 3.011232e-02 Pinaroloxias # Platyspiza -0.67937230 0.22622332 0.201851149 -0.03207100 5.550716e-03 Platyspiza # psittacula -0.38510862 0.22403180 -0.031168002 -0.04067315 -6.216245e-02 psittacula ## plot plot(finches_ppca_scores$PC1, finches_ppca_scores$PC2, main="Scatterplot finches_ppca_scores", xlab="PPC1", ylab="PPC2", pch=19) text(finches_ppca_scores$PC1, finches_ppca_scores$PC2+0.01, labels=finches_ppca_scores$names, col= "blue", cex= 0.7)