# By: Daniele Merico (PhD), BaderLab, Donnelly CCBR, University of Toronto # GENERAL RELEASE NOTES # - Gene Ontology (GO), KEGG and PFAM from Bioconductor libraries # each species has an annotation set # - NCI from web page, in tabular format (http://pid.nci.nih.gov/download.shtml) # primary data for Hs, used orthologs for other species # - Biocarta from whichgenes (http://www.whichgenes.org/) # primary data for Hs, used orthologs for other species # * due to a quirk in the web server, this is not an updated version # - HomoloGene from # ftp://ftp.ncbi.nih.gov/pub/HomoloGene/current/ # OTHER TECHNICAL NOTES # - ID prefixes should be always fully capitalized, # as GSEA automatically converts them to capital letters # PATH path_root.ch <- "/Users/danielemerico/Documents/Research_Works/GeneSet_DB_2.0/" path_ws.ch <- paste (path_root.ch, "R_Works/R_ws/", sep = "") path_data.ch <- paste (path_root.ch, "Core_Data/20101108", sep = "") path_res.ch <- paste (path_root.ch, "Results/v02_20101108/", sep = "") # COMMON FUNCTIONS # Convert to orthologs f.ConvHom <- function (input.eg, taxon_in.id, taxon_out.id, HomGene.df) { groups.id <- HomGene.df$HomologyGroupID[HomGene.df$TaxonID == taxon_in.id & HomGene.df$EgID %in% input.eg] output.eg <- HomGene.df$EgID[HomGene.df$TaxonID == taxon_out.id & HomGene.df$HomologyGroupID %in% groups.id] return (output.eg) } # LIBRARIES # Update # (suggest running in a separate workspace) source("http://bioconductor.org/biocLite.R") biocLite ("GO.db") # v:2.4.5, packaged: 2010-09-23 biocLite ("KEGG.db") # v:2.4.5, packaged: 2010-09-23 biocLite ("PFAM.db") # v:2.4.5, packaged: 2010-09-23 biocLite ("org.Hs.eg.db") # v:2.4.6, packaged: 2010-10-04 biocLite ("org.Mm.eg.db") # v:2.4.6, packaged: 2010-10-04 biocLite ("org.Rn.eg.db") # v:2.4.6, packaged: 2010-10-04 # Load library ("GO.db") library ("KEGG.db") library ("PFAM.db") library ("org.Hs.eg.db") library ("org.Mm.eg.db") library ("org.Rn.eg.db") # IMPORT GO GO_id2eg_Hs.ls <- as.list (org.Hs.egGO2ALLEGS) GO_id2eg_Mm.ls <- as.list (org.Mm.egGO2ALLEGS) GO_id2eg_Rn.ls <- as.list (org.Rn.egGO2ALLEGS) GO_id2des.chv <- unlist (lapply (as.list (GOTERM), Term)) # Prune redundancies GO_id2eg_Hs.ls <- lapply (GO_id2eg_Hs.ls, unique) GO_id2eg_Mm.ls <- lapply (GO_id2eg_Mm.ls, unique) GO_id2eg_Rn.ls <- lapply (GO_id2eg_Rn.ls, unique) # IMPORT KEGG KEGG_id2eg_Hs.ls <- as.list (org.Hs.egPATH2EG) KEGG_id2eg_Mm.ls <- as.list (org.Mm.egPATH2EG) KEGG_id2eg_Rn.ls <- as.list (org.Rn.egPATH2EG) KEGG_id2des.chv <- unlist (as.list (KEGGPATHID2NAME)) # Add ID Prefix names (KEGG_id2des.chv) <- paste ("KEGG:", names (KEGG_id2des.chv), sep = "") names (KEGG_id2eg_Hs.ls) <- paste ("KEGG:", names (KEGG_id2eg_Hs.ls), sep = "") names (KEGG_id2eg_Mm.ls) <- paste ("KEGG:", names (KEGG_id2eg_Mm.ls), sep = "") names (KEGG_id2eg_Rn.ls) <- paste ("KEGG:", names (KEGG_id2eg_Rn.ls), sep = "") # Checks # (output should be always 0) length (setdiff (names (KEGG_id2eg_Hs.ls), names (KEGG_id2des.chv))) length (setdiff (names (KEGG_id2eg_Mm.ls), names (KEGG_id2des.chv))) length (setdiff (names (KEGG_id2eg_Rn.ls), names (KEGG_id2des.chv))) # Add Description Prefix temp.names <- names (KEGG_id2des.chv) KEGG_id2des.chv <- paste ("KEGG: ", KEGG_id2des.chv, sep = "") names (KEGG_id2des.chv) <- temp.names rm (temp.names) # IMPORT PFAM PFAM_eg2id_Hs.ls <- as.list (org.Hs.egPFAM) PFAM_eg2id_Mm.ls <- as.list (org.Mm.egPFAM) PFAM_eg2id_Rn.ls <- as.list (org.Rn.egPFAM) PFAM_id2des.chv <- unlist (as.list (PFAMDE)) # Reformat eg2id to id2eg PFAM_eg2id_Hs.ls <- lapply (PFAM_eg2id_Hs.ls, unique) PFAM_eg2id_Mm.ls <- lapply (PFAM_eg2id_Mm.ls, unique) PFAM_eg2id_Rn.ls <- lapply (PFAM_eg2id_Rn.ls, unique) f.reformat <- function (eg2id.ls) { eg2id.df <- stack (eg2id.ls) colnames (eg2id.df) <- c ("Id", "Eg") id2eg.ls <- unstack (eg2id.df, form = Eg ~ Id) return (id2eg.ls) } PFAM_id2eg_Hs.ls <- lapply (f.reformat (PFAM_eg2id_Hs.ls), unique) PFAM_id2eg_Mm.ls <- lapply (f.reformat (PFAM_eg2id_Mm.ls), unique) PFAM_id2eg_Rn.ls <- lapply (f.reformat (PFAM_eg2id_Rn.ls), unique) rm (PFAM_eg2id_Hs.ls, PFAM_eg2id_Mm.ls, PFAM_eg2id_Rn.ls) # No PFAM descriptions are missing length (setdiff (names (PFAM_id2eg_Hs.ls), names (PFAM_id2des.chv))) # [1] 0 length (setdiff (names (PFAM_id2eg_Mm.ls), names (PFAM_id2des.chv))) # [1] 0 length (setdiff (names (PFAM_id2eg_Rn.ls), names (PFAM_id2des.chv))) # [1] 0 # Add Prefix to Descriptions temp.names <- names (PFAM_id2des.chv) PFAM_id2des.chv <- paste ("PFAM: ", PFAM_id2des.chv, sep = "") names (PFAM_id2des.chv) <- temp.names rm (temp.names) # IMPORT HOMOLOGY DATA setwd (path_data.ch) HomGene.df <- read.table ( file = "Homologene_Build65_20100910.txt", header = T, sep = "\t", quote = "", stringsAsFactors = F ) colnames (HomGene.df) <- c ("HomologyGroupID", "TaxonID", "EgID", "Symbol", "ProteinGI", "ProteinAcc") # Taxa ID # - Hs: 9606 # - Mm: 10090 # - Rn: 10116 # IMPORT BIOCARTA # Read GMT Bioc_raw_Hs.chv <- scan ( file = "Biocarta_Hs_WhichGenes_20100326.GMT", what = character(), sep = "\n" ) Bioc_raw_Hs.ls <- strsplit (Bioc_raw_Hs.chv, "\t") f.extract_slotID <- function (input.chv) {return (input.chv[1])} f.extract_content <- function (input.chv) {return (setdiff (input.chv, input.chv[1]))} Bioc_id2eg_Hs.ls <- lapply (Bioc_raw_Hs.ls, f.extract_content) names (Bioc_id2eg_Hs.ls) <- unlist (lapply (Bioc_raw_Hs.ls, f.extract_slotID)) names (Bioc_id2eg_Hs.ls) <- sub ( names (Bioc_id2eg_Hs.ls), pattern = "Bioc_", replacement = "BIOC:" ) Bioc_id2des.chv <- sub ( names (Bioc_id2eg_Hs.ls), pattern = "PATHWAY", replacement = " Pathway" ) Bioc_id2des.chv <- sub ( Bioc_id2des.chv, pattern = "BIOC:", replacement = " BIOC: " ) names (Bioc_id2des.chv) <- names (Bioc_id2eg_Hs.ls) # Check length (setdiff (names (Bioc_id2eg_Hs.ls), names (Bioc_id2des.chv))) # [1] 0 # Convert to orthologs # Mouse Bioc_id2eg_Mm.ls <- lapply ( lapply ( Bioc_id2eg_Hs.ls, f.ConvHom, 9606, 10090, HomGene.df ), unique ) # Rat Bioc_id2eg_Rn.ls <- lapply ( lapply ( Bioc_id2eg_Hs.ls, f.ConvHom, 9606, 10116, HomGene.df ), unique ) # IMPORT NCI (downloaded 08/11/2010) # Load tabular format: # - protein ID # - pathway description # - pathway ID setwd (path_data.ch) # Note: externally replaced # "naïve" --> "naive" NCI_raw_Hs.df <- read.table ( file = "NCI_Hs_20101108.txt", sep = "\t", quote = "", comment.char = "", header = F, stringsAsFactors = F ) colnames (NCI_raw_Hs.df) <- c ("UniProtID", "PathwayName", "PathwayID") # Reformat to list NCI_id2up_Hs.ls <- unstack (NCI_raw_Hs.df, form = UniProtID ~ PathwayID) # Generate descriptions # and add id and description suffix NCI_id2des.chv <- paste ("NCI: ", NCI_raw_Hs.df$PathwayName, sep = "") names (NCI_id2des.chv) <- paste ("NCI:", NCI_raw_Hs.df$PathwayID, sep = "") names (NCI_id2up_Hs.ls) <- paste ("NCI:", names (NCI_id2up_Hs.ls), sep = "") # Check length (setdiff (names (NCI_id2up_Hs.ls), names (NCI_id2des.chv))) # Convert UniProt to Eg Conv_eg2uniprot.df <- stack (as.list (org.Hs.egUNIPROT)) colnames (Conv_eg2uniprot.df) <- c ("UniProt", "Eg") f.ConvUniprot2Eg <- function (Uniprot.chv) { matching.ix <- Conv_eg2uniprot.df$UniProt %in% Uniprot.chv Eg.chv <- unique (as.character (Conv_eg2uniprot.df$Eg[matching.ix])) return (Eg.chv) } NCI_id2eg_Hs.ls <- lapply (NCI_id2up_Hs.ls, f.ConvUniprot2Eg) # Remove intermediate objects rm (NCI_raw_Hs.df) # Convert to orthologs # Mouse NCI_id2eg_Mm.ls <- lapply ( lapply ( NCI_id2eg_Hs.ls, f.ConvHom, 9606, 10090, HomGene.df ), unique ) # Rat NCI_id2eg_Rn.ls <- lapply ( lapply ( NCI_id2eg_Hs.ls, f.ConvHom, 9606, 10116, HomGene.df ), unique ) # Capitalize IDs names (NCI_id2eg_Hs.ls) <- toupper (names (NCI_id2eg_Hs.ls)) names (NCI_id2eg_Mm.ls) <- toupper (names (NCI_id2eg_Mm.ls)) names (NCI_id2eg_Rn.ls) <- toupper (names (NCI_id2eg_Rn.ls)) names (NCI_id2des.chv) <- toupper (names (NCI_id2des.chv)) # Check length (setdiff (names (NCI_id2eg_Hs.ls), names (NCI_id2des.chv))) # MERGE U_id2eg_Hs.ls <- c (GO_id2eg_Hs.ls, KEGG_id2eg_Hs.ls, NCI_id2eg_Hs.ls, Bioc_id2eg_Hs.ls, PFAM_id2eg_Hs.ls) U_id2eg_Mm.ls <- c (GO_id2eg_Mm.ls, KEGG_id2eg_Mm.ls, NCI_id2eg_Mm.ls, Bioc_id2eg_Mm.ls, PFAM_id2eg_Mm.ls) U_id2eg_Rn.ls <- c (GO_id2eg_Rn.ls, KEGG_id2eg_Rn.ls, NCI_id2eg_Rn.ls, Bioc_id2eg_Rn.ls, PFAM_id2eg_Rn.ls) U_id2des.chv <- c (GO_id2des.chv, KEGG_id2des.chv, NCI_id2des.chv, Bioc_id2des.chv, PFAM_id2des.chv) # Check length (setdiff (names (U_id2eg_Hs.ls), names (U_id2des.chv))) length (setdiff (names (U_id2eg_Mm.ls), names (U_id2des.chv))) length (setdiff (names (U_id2eg_Rn.ls), names (U_id2des.chv))) # PACKAGE INTO GMT f.collapse <- function (input.genes) {paste (input.genes, collapse = "\t")} f.PackGMT <- function (id2eg.ls, id2des.chv, file.name) { genes.chv <- unlist (lapply (id2eg.ls, f.collapse)) output.chv <- paste (names (id2eg.ls), id2des.chv[names (id2eg.ls)], genes.chv, sep = "\t") cat (output.chv, sep = "\n", file = file.name) } setwd (path_res.ch) f.PackGMT ( id2eg.ls = U_id2eg_Hs.ls, id2des.chv = U_id2des.chv, file.name = "GO_K_NCI_BIOC_PF_Hs_eg.GMT" ) f.PackGMT ( id2eg.ls = U_id2eg_Mm.ls, id2des.chv = U_id2des.chv, file.name = "GO_K_NCI_BIOC_PF_Mm_eg.GMT" ) f.PackGMT ( id2eg.ls = U_id2eg_Rn.ls, id2des.chv = U_id2des.chv, file.name = "GO_K_NCI_BIOC_PF_Rn_eg.GMT" ) # FILTER len_U.nv <- sapply (U_id2eg_Hs.ls, f.ulen) U_id2eg_Hs_f10_1200.ls <- U_id2eg_Hs.ls[len_U.nv >= 10 & len_U.nv <= 1200] # SAVE RDATA setwd (path_ws.ch) save (U_id2eg_Hs.ls, U_id2des.chv, file = "Res02_1_U_Hs_nf.RData") save (U_id2eg_Hs_f10_1200.ls, U_id2des.chv, file = "Res02_2_U_Hs_f10_1200.RData")