########################################################################################## ############################# Script 1 ############################# ########################################################################################## ########### This is a job script #!/bin/bash #PBS -S /bin/bash #PBS -N Name #PBS -r n #PBS -l nodes=1:ppn=24,mem=100gb,walltime=24:00:00 #PBS -m bea echo Running on host `hostname` echo Time is `date` perl snowhite_2.0.3.pl -f RAW_READS_PAIR_1.fastq -v Illumina_adapters_list.txt -Q 20 -D T -d 0.01 -L T -p 12 -Y T -l 6 -a B -t B -b 6 -r 10 -i 20 -k T -n T -w T -m 50 -g T -R T -o READS_PAIR_1_fastq_clean ########################################################################################## ############################# Script 2 ############################# ########################################################################################## ########### This is a perl script ################# FQ_pair_HWI.pl #################### # Modified from version provided by Kay Hodgins # # # Assumes: fasta file # # Run: perl FQ_pair_HWI.pl # # This script reads through fastq files and checks that sequences are present in both read directions. # For any sequence that is present in both directions, the Fastq files are written to new fastq file # called paried_oldfilename. Currently, there is nothing done with the "orphans" present in only one file or the other. # # ######################################################## # perl FQ_pair_HWI.pl EtricolorHead_900MB_R1_Block_1_fastq_clean.clean EtricolorHead_900MB_R2_Block_1_fastq_clean.clean #!usr/bin/perl use strict; use warnings; #enter directory for with files for assembly my $dir=$ARGV[0]; my $dir2=$ARGV[1]; open (IN, "<$dir") || die "$_ not found.\n"; open (IN2, "<$dir2") || die "$_ not found.\n"; open (OUT, ">paired_$dir"); open (OUT2, ">paired_$dir2"); my $s=0; my @temp_array=(); my @ind_array=(); my %hash = (); my $key=(); my @split=(); my %hash2 = (); while() { chomp $_; if ($s==0 && /^\@HWI-ST/){ push( @ind_array , $_); @split=split; $key=$split[0]; $s=1; }elsif ($s==1 && $_ !~/^\+$/){ push(@temp_array, $_); #print "$_\n"; }elsif (/^\+$/){ $s=2; push( @ind_array , (join "" , @temp_array)); @temp_array = (); #print "+\n"; }elsif ($_ !~/^\@HWI-ST/ && $s==2){ push(@temp_array, $_); #print "2"; }elsif (/^\@HWI-ST/ && $s==2) { push(@ind_array, (join "" , @temp_array)); @temp_array=(); push (@{$hash{$key}}, [@ind_array]); @ind_array=(); @split=split; $key=$split[0]; $s=1; push( @ind_array , $_); } else{next;} } push(@ind_array, (join "" , @temp_array)); push (@{$hash{$key}}, [@ind_array]); $s=0; @temp_array=(); @ind_array=(); $key=(); @split=(); while() { chomp $_; if ($s==0 && /^\@HWI-ST/){ push( @ind_array , $_); @split=split; $key=$split[0]; $s=1; }elsif ($s==1 && $_ !~/^\+$/){ push(@temp_array, $_); #print "$_\n"; }elsif (/^\+$/){ $s=2; push( @ind_array , (join "" , @temp_array)); @temp_array = (); #print "+\n"; }elsif ($_ !~/^\@HWI-ST/ && $s==2){ push(@temp_array, $_); #print "2"; }elsif (/^\@HWI-ST/ && $s==2) { push(@ind_array, (join "" , @temp_array)); @temp_array=(); push (@{$hash2{$key}}, [@ind_array]); @ind_array=(); @split=split; $key=$split[0]; $s=1; push( @ind_array , $_); } else{next;} } push(@ind_array, (join "" , @temp_array)); push (@{$hash2{$key}}, [@ind_array]); #for $_ ( keys %hash ) { #print "$_\n"; #print "hash $hash{$_}[0][0]\n"; #print "hash $hash{$_}[0][1]\n"; #print "hash $hash{$_}[0][2]\n"; # print "\n"; #} #for $_ ( keys %hash2 ) { #print "$_\n"; #print "hash $hash2{$_}[0][0]\n"; #print "hash $hash2{$_}[0][1]\n"; #print "hash $hash2{$_}[0][2]\n"; # print "\n"; #} foreach $key (keys %hash){ if (exists $hash2{$key}){ print OUT "$hash{$key}[0][0]\n"; print OUT "$hash{$key}[0][1]\n"; print OUT "+\n"; print OUT "$hash{$key}[0][2]\n"; print OUT2 "$hash2{$key}[0][0]\n"; print OUT2 "$hash2{$key}[0][1]\n"; print OUT2 "+\n"; print OUT2 "$hash2{$key}[0][2]\n"; } } ########################################################################################## ############################# Script 3 ############################# ########################################################################################## ########### This is a job script #!/bin/bash #SBATCH --ntasks=48 #SBATCH --mem=250G # memory pool for all cores #SBATCH --time=3-00:00:00 echo Running on host `hostname` echo Time is `date` module load samtools/1.2 export PATH=$PATH:/bowtie-1.0.1 export PATH=$PATH:/trinityrnaseq-2.0.6/ # Set the stack size to unlimited ulimit -s unlimited Trinity --seqType fq --max_memory 250G --left paired_1_SnoWhite_R1_clean.fastq.clean --right paired_1_SnoWhite_R2_clean.fastq.clean --CPU 48 --SS_lib_type RF --min_contig_length 300 --output trinity_out --verbose exit 0 ########################################################################################## ############################# Script 4 ############################# ########################################################################################## ########### This is a job script #!/bin/bash #SBATCH --ntasks=24 #SBATCH --mem-per-cpu=20G # memory per CPU core #SBATCH --time=7-00:00:00 #SBATCH -a 1-13 # use arrays to split the problem LIBRARY="PATH to reference library for blastx, e.g., uniprot_db" cd example/$SLURM_ARRAY_TASK_ID export PATH=$PATH:/ncbi-blast-2.3.0+/bin ulimit -s unlimited blastx -query example/$SLURM_ARRAY_TASK_ID/example$SLURM_ARRAY_TASK_ID.fasta -db $LIBRARY -num_threads 24 -max_target_seqs 1 -evalue 1e-20 -outfmt "6 std qseqid sscinames stitle sblastnames" -out example_$SLURM_ARRAY_TASK_ID_blastx.txt ########################################################################################## ############################# Script 5 ############################# ########################################################################################## ########### example of perl script that retrieves specific transcript IDs for Apicomplexa ########### user can change the terms to be retrieved to any other symbiont #! /usr/bin/perl use warnings; use strict; my $tissue = 'Species_Liver '; my $file='Species_Liver_uniprot_blastx.outfmt6';#needs to be saves as unix #the name of the blastx file my $outfile = '1_Apicomplexa.out'; #the name of the outfile that will collect the filtered lines open my $in, "<", $file or die "Could not open log file. $!\n"; open my $out, ">", $outfile or die "Could not open log file. $!\n"; while(<$in>) { print $out $_ if /Plasmodium/ || /Eimeria/ || /Cryptosporidium/ || /Toxoplasma/; #between / and / you put the name of the keywords that you want print in the output. } close $out; # use as perl X_parasites_Uniprot_Apicomplexa.pl my $line = 0; my $file2 = '1_Apicomplexa.out'; my $outfile2 = '1_Apicomplexa_Species_Liver.out'; open my $in2, "<", $file2 or die "Could not open log file. $!\n"; open my $out2, ">>", $outfile2 or die "Could not open log file. $!\n"; while (my $line = <$in2>) { print $out2 "Apicomplexa "; print $out2 $tissue; print $out2 $line; } unlink $file2; close $out2; ########################################################################################## ############################# Script 6 ############################# ########################################################################################## # One-liner to delete unwanted text and retain only the conting name to be used # by the pyfasta script sed 's/ len=.*$//g' transcriptome.fasta > transcriptome_modified.fasta ########################################################################################## ############################# Script 7 ############################# ########################################################################################## # Retrive the conting from symbiont given a list of conting names # Needs the folowing two text files # 1) list_of_conting to extract: list_of_conting.txt comp101365_c0_seq1 comp11657_c0_seq1 comp11673_c0_seq1 comp11681_c0_seq1 comp11919_c0_seq1 comp143221_c0_seq1 # 2) modified fasta file: transcriptome_modified.fasta >comp101365_c0_seq1 CAAGAGACAAACTCTGCTCATTATTCAGAAAGAGACACACAACAATTGAAGCCTTTGCTGATGTAA... >comp153004_c0_seq1 ACCCAGTGACACCATTGAGAATGTTAAGGCTAAGATACAAGACAAGGAAGGGATTCCTCCAGATCA... >comp84180_c0_seq1 CAACCTGCTGGGCAAGTTCGAACTTGTCGGCATTCCACCTGCACCTCGAGGTGTTCCTCAAATCGA... >comp232109_c0_seq1 CATTAGCATAGGAAACGAAAGATTTAGATGTCCAGAAGCTCTATTCCAGCCGTCATTCGTCGGTAT... >comp11657_c0_seq1 GCAAAATATGGGGTCACCAAGTTTAGCGACCTTACAGATGAAGAATTCACAAGAAATAGCTTGACT... # example of the use of pyfasta pyfasta extract --header --fasta transcriptome_modified.fasta --file list_of_conting.txt > transcriptome_parasite.fasta ########################################################################################## ############################# Script 8 ############################# ########################################################################################## ########################### These are R functions ##### install bioLite # source("http://bioconductor.org/biocLite.R") # biocLite() # biocLite("GO.db") ## required libraries library(data.table) library(plyr) library(GO.db) ########################################################################################## #####%%% key subset the GO.bd to get the TERM and ONTOLOGY as data.frame %%%##### goterm_master <- select(GO.db, keys(GO.db, "GOID"), c("TERM", "ONTOLOGY")) ########################################################################################## ########################## GO children terms ############################### GO_order_children <- function(node_value , ontology_value ) { ## input from user node <- node_value ontology <- ontology_value ### if (ontology == "BP") GOCHILDREN <- GOBPCHILDREN if (ontology == "CC") GOCHILDREN <- GOCCCHILDREN if (ontology == "MF") GOCHILDREN <- GOMFCHILDREN parents <- node # initialize output out <- c(parents) # do the following until there are no more parents while (any(!is.na(parents))) { # Get the unique children of the parents (that aren't NA) children <- unique(unlist(mget(parents[!is.na(parents)], GOCHILDREN))) # append chldren to beginning of `out` # unique will keep the first instance of a duplicate # (i.e. the most recent child is kept) out <- unique(append(children[!is.na(children)], out)) # children become the parents of the next generation parents <- children } return(out) } ######################## END of function ########################### GO_order_children(node_value = "GO:0003674", ontology_value = "MF") ########################################################################################## ############################# Script 9 ############################# ########################################################################################## ## requires an input text file with the GO terms such as: # "all_parasites_main_file.txt" UniProt_Code Genus Species Host_Tissue Transcript_contig True_sequence_lenght_bp Alignment_length_bp Type_non_amphibian_organism NCBI_species_reference NCBI_BLAST_e_value NCBI_percent_identity NCBI_alignment_length NCBI_mismatches NCBI_gap_opens NCBI_bit_score Protein_product Protein_Names GO_Biological_Process GO_Molecular_Function. GO_Cellular_Component GO_IDs Cross_Reference_InterPro Cross_Reference_PANTHER tissue length effective_length expected_count TPM FPKM P40285 Epipedobates boulengeri Gut comp250737_c0_seq1 395 129 Kinetoplastida Leishmania infantum 3.00E-28 82.09 67 11 1 106 Histone H3 Histone H3 DNA binding nucleosome nucleus GO:0003677 GO:0000786 GO:0005634 IPR009072IPR007125IPR000164 PTHR11426 EbouAraG 395 222.52 1 331.81 776.97 Q9UAL5 Epipedobates anthonyi Gut comp333692_c0_seq1 312 446 Apicomplexa Plasmodium falciparum 2.00E-38 62.14 103 39 0 138 Enolase Enolase (EC 4.2.1.11) (2-phospho-D-glycerate hydro-lyase) (2-phosphoglycerate dehydratase) glycolytic process phosphopyruvate hydratase activity magnesium ion binding phosphopyruvate hydratase activity phosphopyruvate hydratase complex GO:0006096 GO:0000287 GO:0004634 GO:0000015 IPR000941IPR020810IPR029065IPR020809IPR020811IPR029017 PTHR11902 EantStPG 312 119.48 1 598.96 1392.43 ########################### These are R functions ##### install bioLite # source("http://bioconductor.org/biocLite.R") # biocLite() # biocLite("GO.db") ## required libraries library(data.table) library(plyr) library(GO.db) ########################################################################################## #####%%% key subset the GO.bd to get the TERM and ONTOLOGY as data.frame %%%##### goterm_master <- select(GO.db, keys(GO.db, "GOID"), c("TERM", "ONTOLOGY")) ########################################################################################## ########################## GO parent terms ############################### GO_order_parent <- function(goterm_list_value, node_value, ontology_value, heriarchy_level_below_root_value = 2) { ## input from user center_node <- node_value ontology <- ontology_value goterm_list <- goterm_list_value heriarchy_level_below_root <- heriarchy_level_below_root_value ### looking for parents if (ontology == "BP") GOPARENTS <- GOBPPARENTS if (ontology == "CC") GOPARENTS <- GOCCPARENTS if (ontology == "MF") GOPARENTS <- GOMFPARENTS # initialize output center_node_out <- c(center_node) # start node hierarchy_node <- center_node_out # start list and add first element as center node result_GO_list <- list() result_GO_list[[1]] <- center_node_out # initiate the counter counter <- 1 ######## while loop to get all parents to root while (!any(hierarchy_node == "all")) { # counter starts counter <- sum(counter, 1) # for parent nodes hierarchy_node <- unique(unlist(mget(hierarchy_node, GOPARENTS))) # add parent nodes to list result_GO_list[[counter]] <- unique(hierarchy_node) } ## select level to get the terminology GO_level_selected <- length(result_GO_list) - heriarchy_level_below_root selected_GO_level_term <- result_GO_list[[GO_level_selected]] ## select only one GO term if more than one if(length(selected_GO_level_term) > 1) { selected_GO_level_term <- selected_GO_level_term[1] } # get the term value GO_term_result <- goterm_list[goterm_list[, "GOID"] == selected_GO_level_term,] # select the highest row per each element return (GO_term_result) } ######################## END of function ########################### species_GO_term <- GO_order_parent (goterm_list_value = goterm_master, node_value = "GO:0003677", ontology_value = "MF", heriarchy_level_below_root_value = 2) ########################################################################################## ########################## GO parent terms ############################### retrive_parent_GO_terms_values <- function (master_data_table_value, goterm_base_value, ontology_value, heriarchy_level_value, outfile_name_value) { # user input master_table <- master_data_table_value goterm_base <- goterm_base_value selected_ontology <- ontology_value heriarchy_level <- heriarchy_level_value outfile_name <- outfile_name_value # read tables master_df <- read.table(master_table, header = T, sep = "\t", stringsAsFactors = F) # master_df <- master_df[1:6,] # create master list to save results master_list_parent <- list() ################### loop to all rows in the data base #################### for (i_row in 1:nrow(master_df)) { # get row GO_row_number <- master_df[i_row,] # GO_row_number <- master_df[127,] # GO_row_number <- master_df[1,] # GO_row_number <- master_df[152,] GO_species_row <- GO_row_number$GO_IDs ############## loop : determine if GO_species_row has no GOIDs ################## if (GO_species_row == "") { print(paste("At least one row has not a GO term for the selected ontology = ", selected_ontology, sep = "")) master_list_parent[[i_row]] <- data.frame(GOID= "NA", TERM= "NA", ONTOLOGY = "NA", CN_GOID= "NA", CN_TERM= "NA", CN_ONTOLOGY = "NA") } else { # get element and split if multiple GO_IDs GO_species_row_terms <- unlist(strsplit(GO_species_row," GO:")) GO_species_row_terms <-gsub("GO:", "", GO_species_row_terms) GO_species_row_terms <-paste("GO:",GO_species_row_terms, sep="") ## search GO term database GO_species_row_list <- list() for(i_GOID in 1:length(GO_species_row_terms)){ GO_species_row_list[[i_GOID]] <- goterm_base[goterm_base[, "GOID"] == GO_species_row_terms[i_GOID],] } GO_species_row_df <- do.call(rbind, GO_species_row_list) ## separate by ontology # selected_ontology <- "BP" # selected_ontology <- "CC" if (selected_ontology == "MF") { GO_species_row_ontology <- GO_species_row_df[GO_species_row_df[, "ONTOLOGY"] == "MF",]} if (selected_ontology == "CC") { GO_species_row_ontology <- GO_species_row_df[GO_species_row_df[, "ONTOLOGY"] == "CC",]} if (selected_ontology == "BP") { GO_species_row_ontology <- GO_species_row_df[GO_species_row_df[, "ONTOLOGY"] == "BP",]} ## select only the desidered ontology if(nrow(GO_species_row_ontology) == 0 ) { center_node_GO_info_NA_parent_df <- GO_species_row_df[1,] names(center_node_GO_info_NA_parent_df) <- c("CN_GOID", "CN_TERM", "CN_ONTOLOGY") parent_NA_GO_info <- data.frame(GOID= "NA", TERM= "NA", ONTOLOGY = "NA") final_row_parent_NA_GO_term <- cbind(parent_NA_GO_info, center_node_GO_info_NA_parent_df) master_list_parent[[i_row]] <- final_row_parent_NA_GO_term ################################# nrow(GO_species_row_ontology) > 1 ################## } else if(nrow(GO_species_row_ontology) == 1 ) { search_ontology <- GO_species_row_ontology ## center node information center_node_GO_info_df <- search_ontology names(center_node_GO_info_df) <- c("CN_GOID", "CN_TERM", "CN_ONTOLOGY") ## searchable GO term row_GO_term_to_search <- search_ontology$GOID ## do GO term on ontology search row_GO_term_parent <- try(GO_order_parent (goterm_list_value = goterm_master, node_value = row_GO_term_to_search, ontology_value = selected_ontology, heriarchy_level_below_root_value = heriarchy_level), silent = F) ## if lower level not available if(class(row_GO_term_parent) == "try-error") { print(paste("At least one GO term has not a GO term parent at the selected heriarchy level = ", heriarchy_level, sep = "")) row_GO_term_parent <- GO_species_row_ontology[1,] } ## merge parent and center node final_row_GO_term <- cbind(row_GO_term_parent, center_node_GO_info_df) ## get values as data fram master_list_parent[[i_row]] <- final_row_GO_term ################################# nrow(GO_species_row_ontology) > 1 ################## } else if(nrow(GO_species_row_ontology) > 1 ) { search_ontology <- GO_species_row_ontology[1,] ## center node information center_node_GO_info_df <- search_ontology names(center_node_GO_info_df) <- c("CN_GOID", "CN_TERM", "CN_ONTOLOGY") ## searchable GO term row_GO_term_to_search <- search_ontology$GOID ## do GO term on ontology search row_GO_term_parent <- try(GO_order_parent (goterm_list_value = goterm_master, node_value = row_GO_term_to_search, ontology_value = selected_ontology, heriarchy_level_below_root_value = heriarchy_level), silent = TRUE) ## if lower level not available if(class(row_GO_term_parent) == "try-error") { print(paste("At least one GO term has not a GO term parent at the selected heriarchy level = ", heriarchy_level, sep = "")) row_GO_term_parent <- GO_species_row_ontology[1,] } ## merge parent and center node final_row_GO_term <- cbind(row_GO_term_parent, center_node_GO_info_df) ## get values as data fram master_list_parent[[i_row]] <- final_row_GO_term } ################### END loop to all rows in the data base #################### } } ## bind results to a df master_list_parent_df <- do.call(rbind, master_list_parent) ## write dataframe to table write.table(master_list_parent_df, file=paste(outfile_name, "_GO_ontology_", selected_ontology, "_level_", heriarchy_level, ".txt", sep=""), sep = "\t", row.names = F) ## return results return(master_list_parent_df) } ######################## END of function ########################### species_MF_level_2_parents <- retrive_parent_GO_terms_values (master_data_table_value = "all_parasites_main_file.txt", goterm_base_value = goterm_master, ontology_value = "MF", heriarchy_level_value = 2, outfile_name_value = "all_parasites") species_BP_level_3_parents <- retrive_parent_GO_terms_values (master_data_table_value = "all_parasites_main_file.txt", goterm_base_value = goterm_master, ontology_value = "BP", heriarchy_level_value = 3, outfile_name_value = "all_parasites") species_CC_level_4_parents <- retrive_parent_GO_terms_values (master_data_table_value = "all_parasites_main_file.txt", goterm_base_value = goterm_master, ontology_value = "CC", heriarchy_level_value = 4, outfile_name_value = "all_parasites")