#Week (Jan 13 & 15) #We are going to use the slide number as a reference for the R-code #SLIDE 2: Reading tables #this sets the working directory where you have you input data files and where all your #output files will be placed setwd("/Users/jcsantos/Desktop/R_class_winter_2015/0_Bio559R_course_final_files/week_2/data") #this is my working directory #read a tab delimited file read.table (file="Birds_mass_BM_McNab_2009_class_tab.txt", header = TRUE, sep = "\t") #assigns table to object 'birds_MR' birds_MR <- read.table (file="Birds_mass_BM_McNab_2009_class_tab.txt", header = TRUE, sep = "\t") #prints on screen 'birds_MR' object birds_MR #review the first 6 rows head(birds_MR) #SLIDE 3: Data frames # Our input data is actually a data frame. This give you the struture str(birds_MR) #print element in row 100 and column 3 (i.e., [m,n] matrix notation) birds_MR [100,3] #print elements in row 100 birds_MR [100,] #print elements in column 3 (i.e., BMR_kJ_per_h variable) birds_MR [,3] #print elements in the first 10 rows birds_MR [1:10,] #SLIDE 4: Data frames #Accessing data frame information # dimensions of data frame in rows by column in [m,n] matrix notation dim(birds_MR) # Get the names of variables in the data frame names(birds_MR) # basic statistics of the variables in the data frame summary(birds_MR) # Dealing with missing data in data frame # rows with missing values birds_MR_incomplete_cases <- birds_MR[!complete.cases(birds_MR),] birds_MR_incomplete_cases # delete rows with incomplete data birds_MR_complete_cases <- birds_MR [complete.cases(birds_MR),] birds_MR_complete_cases # similar to process as above birds_MR_complete_cases_2 <- na.omit(birds_MR) birds_MR_complete_cases_2 #How many species were eliminated? dim(birds_MR) - dim(birds_MR_complete_cases_2) # we make use of the dim function and subtract #SLIDE 5: Data frames #Subsetting data frame into other data frames # this will select rows (species) that are both Nocturnal and tropical. # Notice the lower and upper case of the names, they need to be exact as is in the data # frame birds_nocturnal_temperate <- subset(birds_MR, Time=='Nocturnal' & Climate == 'tropical') birds_nocturnal_temperate # this will select species with more 1000 g birds_big <- subset(birds_MR, Mass_g>1000) birds_big # this will select species with more 1000 g that live in temperate climates birds_big_temperate <- subset(birds_MR, Mass_g > 1000 & Climate == 'temperate') birds_big_temperate # this will select species with more 1000 g that do not live in temperate climates # Notice the ! Indicating that we do not want the species that are temperate birds_big_not_temperate <- subset(birds_MR, Mass_g > 1000 & !Climate == 'temperate') birds_big_not_temperate # Notice the %in% that indicate ‘nested' in Climate variable states: temperate and polar birds_temperate_polar <- subset(birds_MR, Climate %in% c('temperate', 'polar')) birds_temperate_polar #SLIDE 6: Data frames #We can create new variables in data frame that include calculations between other variables in data frame birds_MR$Mass_specific_BMR_kj_per_h_g <- birds_MR$BMR_kJ_per_h/birds_MR$Mass_g head(birds_MR) #a new column with the new calculated variable will appear #ifelse function is useful (see ?ifelse). Notice its use ifelse(test, yes, no) birds_MR$size <- ifelse(birds_MR$Mass_g > 1000, "big_bird", "small_bird") #a new column with the new categorical variable will appear head(birds_MR) #Some basic statistical functions mean(birds_MR$Mass_g) # you can get this from the summary() function sd (birds_MR$Mass_g) # standard deviation min(birds_MR$Mass_g) # minimum value max(birds_MR$Mass_g) # maximum value birds_MR$log10Mass_g <- log10(birds_MR$Mass_g) #get log10 birds_MR$log10Mass_g birds_MR$log10BMR <- log10(birds_MR$BMR_kJ_per_h) #we needed to transform this variables #for log-log calculations necessary #for Mass-BMR regressions head(birds_MR) #SLIDE 7: Data frames #Some basic statistical functions: Correlations #?cor to understand the use of this function cor(birds_MR$log10Mass_g, birds_MR$log10BMR , method = c("pearson")) #?cor.test to understand the use of this function cor.test( ~ birds_MR$log10Mass_g + birds_MR$log10BMR, data=birds_MR) #SLIDE 8: Data frames #Scatter plot of our relationship between variables plot(birds_MR$log10Mass_g, birds_MR$log10BMR, main="Log-Log mass versus BMR", xlab="Log-Mass", ylab="Log-BMR", pch=19, cex.axis=1.5, cex.lab = 1.5) # regression line (y~x) abline(lm(birds_MR$log10BMR~birds_MR$log10Mass_g ), col="red", lwd = 4) # adds some text text(x= 2, y= 2, labels="Pearson's correlation r=0.97", cex = 1.5) #SLIDE 9: Other useful functions # Column selection from data frame #this selects specific columns and crates a new data frame birds_MR_based_food <- data.frame(birds_MR$Food, birds_MR$log10Mass_g, birds_MR$log10BMR) head(birds_MR_based_food) #similar results birds_MR_based_food_2 <- birds_MR [c("Food", "log10Mass_g", "log10BMR")] head(birds_MR_based_food_2) # check the last 6 rows of this data frame tail(birds_MR_based_food) #Renaming columns of the data frame #change the name of variables names(birds_MR_based_food) <- c('food', 'log10Mass','log10BMR') head(birds_MR_based_food) #Writing table to file with comma delimited columns write.table(birds_MR, file="Birds_mass_BM_McNab_2009_class_csv_updated.txt", col.names = TRUE, sep = ",") #SLIDE 10: Functions #Basic example function add_two_variables <- function (x,y) { z <- x + y z } add_two_variables(1,2) #sample of 6 random numbers from 1 to 40 a <-sample(1:40, 6) a b <- sample(1:40, 6) b #use function add_two_variables(a,b) #place results in a data frame c <- add_two_variables(a,b) results <-data.frame(a,b,c) results #Functions can be applied as loops lapply (birds_MR, sd) # list of results of applying function in this case ‘sd’ to our data frame, some warning will appear as some variables are categories #SLIDE 11: Exploring beyond… #Even a better correlation plot using ‘ggplot2’ package install.packages("ggplot2") install.packages("plyr") library(ggplot2) library(plyr) birds_MR_based_food <- data.frame(birds_MR$Food, birds_MR$log10Mass_g, birds_MR$log10BMR) head(birds_MR_based_food) names(birds_MR_based_food) <- c('food', 'log10Mass','log10BMR') #change the name of variables cor_func <- function(x) #we can build our own function to do correlations for specific groups { return(data.frame(COR = round(cor(x$log10Mass, x$log10BMR),4))) } ddply(birds_MR_based_food, .(food), cor_func) #ddply is a loop function that allows to run groups of data #based on grouping variable (food) cor_by_food <- ddply(birds_MR_based_food, .(food), cor_func) #ddply is a loop function that allows to run groups of data based on grouping variable (food) ggplot(data = birds_MR_based_food, aes(x = log10Mass, y = log10BMR, group=food, colour=food)) + #define x and y variables, grouping variable, coloring varaible geom_smooth(method = "lm", se=FALSE, aes(fill = food), formula = y ~ x) + #define the method for regression in this case lm (least square regression), color by food type geom_point(aes(shape = food, size = 2)) + annotate("text", x = 1, y = 2.5, label = "carnivore R=", color="red") + annotate("text", x = 1.5, y = 2.5, label = as.character(cor_by_food[1,2]), color="red") + annotate("text", x = 1, y = 2, label = "ominivore R=", color="darkgreen") + annotate("text", x = 1.5, y = 2, label = as.character(cor_by_food[2,2]), color="darkgreen") + annotate("text", x = 1, y = 1.5, label = "vegetarian R=", color="blue") + annotate("text", x = 1.5, y = 1.5, label = as.character(cor_by_food[3,2]), color="blue") ggsave("correlations_food.pdf", width = 16, height = 9, dpi = 120) #it will save the graph as a pdf #################################################################################### #################################################################################### #################################################################################### #################################################################################### #SLIDE 34: Getting Sequences from GenBank using R-packages # set the directory to save resulting files setwd("/Users/jcsantos/Desktop/R_class_winter_2015/0_Bio559R_course_final_files/week_2/data") #I open terminal and drag the folder to it to get the path. Then, copy and paste. #We need to install and load the following packages: install.packages("ape") install.packages("seqinr") library(ape) #this is a general R-package for phylogenetics and comparative methods library("seqinr") #this is an specialized package for nucleotide sequence management #Let’s check that our packages have been loaded correctly sessionInfo() #SLIDE 35: Getting Sequences from GenBank using R-packages #Let's use 'ape' to read the sequence from GenBank this with the function: ?read.GenBank #This function connects to the GenBank database, and reads nucleotide sequences using accession numbers given as arguments. # read.GenBank(access.nb, seq.names = access.nb, as.character = FALSE) #access.nb: a vector of mode character giving the accession numbers. #seq.names: the names to give to each sequence; by default the accession numbers. #as.character: a logical whether to return the sequences as an object "DNAbin". #Let's read the casque-headed lizard (Basiliscus basiliscus) RAG1 sequence JF806202 seq_1_DNAbin <- read.GenBank("JF806202") #save as DNAbin object: attr(seq_1_DNAbin, "species") #to get the specie name of the sequence seq_1_DNAbin$JF806202 str(seq_1_DNAbin) # we get the structure of the object seq_1_character <- read.GenBank("JF806202", as.character = TRUE)#save as character object: seq_1_character #this is not a very nice format #SLIDE 36: Read sequences using accession numbers #Create a vector of GenBank accession numbers that we want lizards_accession_numbers <- c("JF806202", "HM161150", "FJ356743", "JF806205", "JQ073190", "GU457971", "FJ356741", "JF806207", "JF806210", "AY662592", "AY662591", "FJ356748", "JN112660", "AY662594", "JN112661", "HQ876437", "HQ876434", "AY662590", "FJ356740", "JF806214", "JQ073188", "FJ356749", "JQ073189", "JF806216", "AY662598", "JN112653", "JF806204", "FJ356747", "FJ356744", "HQ876440", "JN112651", "JF806215", "JF806209") lizards_accession_numbers#create a vector a GenBank accession numbers # Get those sequences and save them in a single DNAbin object: lizards_sequences <- read.GenBank(lizards_accession_numbers) #read sequences and place them in a DNAbin object lizards_sequences #a brief summary of what is in the object, including base composition str(lizards_sequences) #a list of the DNAbin elements with length of the sequences #notice the one of the attributes is the species names #SLIDE 37: Read sequences and create a fasta file format #Lets explore more the DNAbin object: attributes(lizards_sequences) #see the list of attributes and contents names(lizards_sequences) #the accession numbers attr(lizards_sequences, "species") # we get the species list. Notice that the attr function is slightly different #However, it is hard remember which accession number corresponds to which species. #So we can use the previous information to create first a vector with such information lizards_sequences_GenBank_IDs <- paste(attr(lizards_sequences, "species"), names(lizards_sequences), sep ="_RAG1_") ## build a character vector with the species, GenBank accession numbers, and gene ## name "_RAG1_" this is its common abbreviation: recombination activating protein 1 ## notice the use of the paste function: textA, textB, textC ## results in: textAtextCtextB lizards_sequences_GenBank_IDs #a more informative vector of names for our sequences #SLIDE 38: Write a fasta file format #Let’s write sequences to a text file in fasta format using write.dna(). However, only accession numbers are included. ?write.dna # This function writes in a file a list of DNA sequences in sequential, interleaved, or FASTA format. ### we are going to write in fasta format write.dna(lizards_sequences, file ="lizard_fasta_1.fasta", format = "fasta", append = FALSE, nbcol = 6, colsep = " ", colw = 10) ########### Some relevant arguments for write.dna() #x: a list or a matrix of DNA sequences. #file: a file name specified to contain our sequences #format: Three choices are possible: "interleaved", "sequential", or "fasta", or any #unambiguous abbreviation of these. #append: a logical, if TRUE the data are appended to the file without erasing the data #possibly existing in the file, otherwise the file is overwritten (FALSE the default). #nbcol: a numeric specifying the number of columns per row (6 by default) #colsep: a character used to separate the columns (a single space by default). #colw: a numeric specifying the number of nucleotides per column (10 by default). ########### #SLIDE 40: Rewrite a fasta file format with more information # Read our fasta file using seqinr package lizard_seq_seqinr_format <- read.fasta(file = "lizard_fasta_1.fasta", seqtype = "DNA", as.string = TRUE, forceDNAtolower = FALSE) lizard_seq_seqinr_format #this shows different form to display the same sequence information # Rewrite our fasta file using the name vector that we created previously write.fasta(sequences = lizard_seq_seqinr_format, names = lizards_sequences_GenBank_IDs, nbchar = 10, file.out = "lizard_seq_seqinr_format.fasta") #Suggestion: Do not rearrange, delete or add sequenced to the fasta file, as the function will assign the names in the order provided in the file and the name vector # Let's check our new fasta file ‘lizard_seq_seqinr_format.fasta’ #SLIDE 41: Get sequences without using accession numbers # We can use a package that use an API (application programming interface) to interact with the NCBI website. # More info in: http://en.wikipedia.org/wiki/Application_programming_interface install.packages ("rentrez") library (rentrez) # Let's get some lizard sequences lizard <- "Basiliscus basiliscus[Organism]" #We want a character vector #nucleotide database (nuccore) and retmax determines the max number lizard_search <- entrez_search(db="nuccore", term=lizard, retmax=40) lizard_search lizard_search$ids #gives you the NCBI ids #gets your sequences as a character vector lizard_seqs <- entrez_fetch(db="nuccore", id=lizard_search$ids, rettype="fasta") lizard_seqs #SLIDE 42: Get sequences without using accession numbers # Lets get our Basiliscus basiliscus RAG 1 sequence Bbasiliscus_RAG1 <- "Basiliscus basiliscus[Organism] AND RAG1[Gene]" #nucleotide database (nuccore) and retmax determines no more than 10 access numbers to return Bbasiliscus_RAG1_search <- entrez_search(db="nuccore", term=Bbasiliscus_RAG1, retmax=10) Bbasiliscus_RAG1_search Bbasiliscus_RAG1_search$ids #gives you the NCBI ids Bbasiliscus_RAG1_seqs <- entrez_fetch(db="nuccore", id=Bbasiliscus_RAG1_search$ids, rettype="fasta") Bbasiliscus_RAG1_seqs #notice \n (new line) delimiter. Other common delimiters are \r (carriage return) and \t (tab). write(Bbasiliscus_RAG1_seqs, "Bbasiliscus_RAG1.fasta", sep="\n") #gets sequence to a file # We can read our fasta file using seqinr package Bbasiliscus_RAG1_seqinr_format <- read.fasta(file = "Bbasiliscus_RAG1.fasta", seqtype = "DNA", as.string = TRUE, forceDNAtolower = FALSE) Bbasiliscus_RAG1_seqinr_format # you can also check the .fasta file in the working folder #SLIDE 43: Example: Accessing Cytochrome B Sequences #We can use the ‘rentrez’ package to get lots of sequences using taxonomic classifications for specific markers Liolaemus_CYTB <- "Liolaemus[Organism] AND CYTB[Gene]" #This is a well-studied gene from this genus of South American lizards Liolaemus_CYTB_search <- entrez_search(db="nuccore", term=Liolaemus_CYTB, retmax=100) Liolaemus_CYTB_search #There are 2539 sequences that match this query # Let's adjust the search and fetch all sequences of of sequences using taxonomic classifications for specific markers Liolaemus_CYTB_search_2 <- entrez_search(db="nuccore", term=Liolaemus_CYTB, retmax=2539) Liolaemus_CYTB_search_2$ids #gives you the NCBI ids Liolaemus_CYTB_seqs <- entrez_fetch(db="nuccore", id=Liolaemus_CYTB_search_2$ids , rettype="fasta") #we get an error "client error: (414) Request-URI Too Long". We are asking too many sequences #SLIDE 44: Example: Accessing Cytochrome B Sequences # Lets adjust the search and fetch by smaller chunks so we can get the first 1500 sequences Liolaemus_CYTB_seqs_part_1 <- entrez_fetch(db="nuccore", id=Liolaemus_CYTB_search_2$ids[1:500] , rettype="fasta") Liolaemus_CYTB_seqs_part_2 <- entrez_fetch(db="nuccore", id=Liolaemus_CYTB_search_2$ids[501:1000] , rettype="fasta") Liolaemus_CYTB_seqs_part_3 <- entrez_fetch(db="nuccore", id=Liolaemus_CYTB_search_2$ids[1001:1500] , rettype="fasta") # Lets write as single file by appending all 3 chucks of sequences write(Liolaemus_CYTB_seqs_part_1, "Liolaemus_CYTB_seqs.fasta", sep="\n") #it gets the sequences to the same file by changing the logical argument of append from #the default FALSE to TRUE (i.e., can abbreviate TRUE with T or other unambiguous #abbreviation) write(Liolaemus_CYTB_seqs_part_2, "Liolaemus_CYTB_seqs.fasta", sep="\n", append = TRUE) write(Liolaemus_CYTB_seqs_part_3, "Liolaemus_CYTB_seqs.fasta", sep="\n", append = TRUE) #you will get a 1.3 Mb file with all 1500 sequences #SLIDE 45: Example: Accessing Cytochrome B Sequences #We can read our fasta file using seqinr package and rename the sequences Liolaemus_CYTB_seqs_seqinr_format <- read.fasta(file = "Liolaemus_CYTB_seqs.fasta", seqtype = "DNA", as.string = TRUE, forceDNAtolower = FALSE) Liolaemus_CYTB_seqs_seqinr_format Liolaemnus_CYTB_names <- attr(Liolaemus_CYTB_seqs_seqinr_format, "name") #eliminate characters after "." using ?gsub (Pattern Matching and Replacement) Liolaemnus_CYTB_names <- gsub("\\..*","", Liolaemnus_CYTB_names) #eliminate characters before "|" using ?gsub (Pattern Matching and Replacement) Liolaemnus_CYTB_names <- gsub("^.*\\|", "", Liolaemnus_CYTB_names) #we get only the accession numbers Liolaemnus_CYTB_names #SLIDE 46: Example: Accessing Cytochrome B Sequences # We can read our fasta file using ape package to get accession numbers and species names Liolaemus_CYTB_seqs_ape_format <- read.GenBank(Liolaemnus_CYTB_names) attr(Liolaemus_CYTB_seqs_ape_format, "species") #to get the species names of the sequence names(Liolaemus_CYTB_seqs_ape_format) # build a vector object with the species, GenBank accession numbers, and type of gene Liolaemus_CYTB_seqs_GenBank_IDs <- paste(attr(Liolaemus_CYTB_seqs_ape_format, "species"), names(Liolaemus_CYTB_seqs_ape_format), sep="_CYTB_") Liolaemus_CYTB_seqs_GenBank_IDs #vector of names to add to sequences # Read our fasta file 'Liolaemus_CYTB_seqs.fasta' using seqinr package Liolaemus_CYTB_seqs_seqinr_format <- read.fasta(file = "Liolaemus_CYTB_seqs.fasta", seqtype = "DNA", as.string = TRUE, forceDNAtolower = FALSE) # Rewrite our fasta file using the name vector that we created previously write.fasta(sequences = Liolaemus_CYTB_seqs_seqinr_format, names = Liolaemus_CYTB_seqs_GenBank_IDs, nbchar = 10, file.out = "Liolaemus_CYTB_seqs_seqinr_format.fasta")