#acl All:read DanieleMerico:write,delete,revert = How to generate gene-set collections from Bioconductor = == Description == This script exports gene annotations relating genes to Gene Ontology, the pathway database KEGG and the protein domain resource PFAM. Use the final part of the script to export these in the GMT format, a gene-set file format that is used by the enrichment analysis tool GSEA. == Code == {{{ #!rscript numbers=off # UPDATE LIBRARIES source("http://bioconductor.org/biocLite.R") biocLite ("GO.db") biocLite ("KEGG.db") biocLite ("org.Hs.eg.db") biocLite ("PFAM.db") # LOAD LIBRARIES library ("GO.db") library ("KEGG.db") library ("PFAM.db") library ("org.Hs.eg.db") # PACKAGE GO INTO LIST # (ID: GO:xxxxx) GO_Hs_eg.ls <- as.list (org.Hs.egGO2ALLEGS) GO_Hs_ID2name.chv <- unlist (lapply (as.list (GOTERM), Term)) # PACKAGE KEGG INTO LIST # (ID: KEGG_hsa:xxxxx) # Import KEGG_Hs_eg.ls <- as.list (org.Hs.egPATH2EG) KEGG_Hs_ID2name.chv <- unlist (as.list (KEGGPATHID2NAME)) # Add Prefix to IDs and Names ztemp_names.chv <- paste ( "KEGG:", KEGG_Hs_ID2name.chv[names (KEGG_Hs_eg.ls)], sep = "" ) ztemp_IDs.chv <- paste ( "KEGG:hsa", names (KEGG_Hs_ID2name.chv[names (KEGG_Hs_eg.ls)]), sep = "" ) KEGG_Hs_ID2name.chv <- ztemp_names.chv names (KEGG_Hs_ID2name.chv) <- ztemp_IDs.chv names (KEGG_Hs_eg.ls) <- paste ( "KEGG:hsa", names (KEGG_Hs_eg.ls), sep = "" ) # PACKAGE PFAM INTO LIST # (ID: KEGG_hsa:xxxxx) # Import PFAM_eg2pfam.ls <- as.list (org.Hs.egPFAM) PFAM2names.chv <- unlist (as.list (PFAMDE)) # Change Format # from EG->PFAM_ID to PFAM_ID->EG PFAM_eg2pfam.df <- stack (PFAM_eg2pfam.ls) colnames (PFAM_eg2pfam.df) <- c ("PFAM_ID", "EG") PFAM_eg.ls <- unstack (PFAM_eg2pfam.df, form = EG ~ PFAM_ID) # Warning: some PFAM entries don't have name # some problem with the R package (?) # UNITE GS_eg.ls <- c (GO_Hs_eg.ls, KEGG_Hs_eg.ls, PFAM_eg.ls) GS_ID2name.chv <- c (GO_Hs_ID2name.chv, KEGG_Hs_ID2name.chv, PFAM2names.chv) # PACKAGE INTO GMT f.collapse <- function (input.genes) {paste (input.genes, collapse = "\t")} genes.chv <- unlist (lapply (GS_eg.ls, f.collapse)) output.chv <- paste (names (GS_eg.ls), GS_ID2name.chv[names (GS_eg.ls)], genes.chv, sep = "\t") cat (output.chv, sep = "\n", file = "GO_KEGG_PFAM_Hs_eg.GMT") }}}