#acl All:read DanieleMerico:write,delete,revert {{{#!rscript numbers=off # IMPORT ANNOTATION FROM LIBRARIES library (org.Hs.eg.db) Ann_eg2sy.chv <- unlist (as.list (org.Hs.egSYMBOL)) Ann_sy2eg.chv <- names (Ann_eg2sy.chv) names (Ann_sy2eg.chv) <- Ann_eg2sy.chv Ann_eg2name.chv <- unlist (as.list (org.Hs.egGENENAME)) # GENERATE DIFFERENTIAL EXPRESSION SCORES # Import expression matrix in GSEA format setwd ("C:/Users/Daniele/Documents/DM New/Università/Classes_Taught/OBI Pathway Analysis/L2_Assignment/Data_Core") expr.df <- read.table ( file = "MCF7_ExprMx_v2.txt", sep = "\t", header = T, quote = "", stringsAsFactors = F ) # Define function # the score is -log(p-value) multiplied by # (-1) for DOWN and (+1) for UP f.t_test <- function (input.nv) { t.htest <- t.test (x = input.nv[1: 3], y = input.nv[4: 6], alternative = "two.sided") logpv.n <- - log (t.htest$p.value) * sign (t.htest$statistic) return (logpv.n) } # Apply function diff_expr.mx <- matrix (ncol = 2, nrow = nrow (expr.df)) diff_expr.mx[, 1] <- apply (expr.df[, grep (colnames (expr.df), pattern = "_12h_")], 1, f.t_test) # apply (expr.df[, c ("E2_12h_01", "E2_12h_02", "E2_12h_03", "NT_12h_01", "NT_12h_02", "NT_12h_03")], 1, f.t_test) diff_expr.mx[, 2] <- apply (expr.df[, grep (colnames (expr.df), pattern = "_24h_")], 1, f.t_test) # apply (expr.df[, c ("E2_24h_01", "E2_24h_02", "E2_24h_03", "NT_24h_01", "NT_24h_02", "NT_24h_03")], 1, f.t_test) colnames (diff_expr.mx) <- c ("diff_12h", "diff_24h") rownames (diff_expr.mx) <- expr.df$NAME # Make data.frame diff_expr_1.df <- data.frame ( GeneID = rownames (diff_expr.mx), GeneSy = Ann_eg2sy.chv [as.character (expr.df$NAME)], GeneName = Ann_eg2name.chv[as.character (expr.df$NAME)], stringsAsFactors = F ) diff_expr_2.df <- as.data.frame (diff_expr.mx) diff_expr.df <- cbind (diff_expr_1.df, diff_expr_2.df) # IMPORT AND FORMAT HUMAN INTERACTION DATA # Import .sif file into data.frame int.df <- read.table ( file = "homo-sapiens.sif", sep = "\t", header = F, quote = "", stringsAsFactors = F ) colnames (int.df) <- c ("Source", "IntType", "Target") # Restrict to "INTERACTS_WITH" int_ppi.df <- int.df[int.df$IntType == "INTERACTS_WITH", ] nrow (int.df) # 245077 nrow (int_ppi.df) # 113203 # Convert to Entrez-gene int_ppi_eg.df <- int_ppi.df int_ppi_eg.df$Source <- Ann_sy2eg.chv[int_ppi.df$Source] int_ppi_eg.df$Target <- Ann_sy2eg.chv[int_ppi.df$Target] # Remove lines that were not completely converted # (unconverted values are NAs) sel.ix <- which ((! is.na (int_ppi_eg.df$Source)) & (! is.na (int_ppi_eg.df$Target))) int_ppi_eg.df <- int_ppi_eg.df[sel.ix, ] nrow (int_ppi_eg.df) # [1] 109081 # lost in conversion: 113203 - 109081 # 4122 (3.6%) # Remove duplications # 1) order pairs # 2) remove redundant pairs f.orderPairs <- function (input.chv) { output.chv <- input.chv sorted.chv <- sort (output.chv[c (1, 3)]) output.chv[1] <- sorted.chv[1] output.chv[3] <- sorted.chv[2] return (output.chv) } int_ppi_eg_unq.df <- unique ( as.data.frame ( t ( apply (int_ppi_eg.df, 1, f.orderPairs) ), stringsAsFactors = F ) ) nrow (int_ppi_eg_unq.df) # 103588 # Restrict network to functional group of interest GO.ls <- as.list (org.Hs.egGO2ALLEGS) # - Replication Fork (GO:0005657) RepF.genes <- unique (unlist (GO.ls[["GO:0005657"]])) RepF.ix <- which ((int_ppi_eg_unq.df$Source %in% RepF.genes) & (int_ppi_eg_unq.df$Target %in% RepF.genes)) int_ppi_eg_RepF.df <- int_ppi_eg_unq.df[RepF.ix, ] # 90% of Replication Fork genes # are mapped to the physical interactions network length (RepF.genes) # 30 length (unique (c (int_ppi_eg_RepF.df$Source, int_ppi_eg_RepF.df$Target))) # 27 # - DNA Replication (GO:0006260) RepDNA.genes <- unique (unlist (GO.ls[["GO:0006260"]])) RepDNA.ix <- which ((int_ppi_eg_unq.df$Source %in% RepDNA.genes) & (int_ppi_eg_unq.df$Target %in% RepDNA.genes)) int_ppi_eg_RepDNA.df <- int_ppi_eg_unq.df[RepDNA.ix, ] # 64.2% of DNA replication genes # are mapped to the physical interactions network length (RepDNA.genes) # 232 length (unique (c (int_ppi_eg_RepDNA.df$Source, int_ppi_eg_RepDNA.df$Target))) # 149 # - APC-dependent Protein Degradation (GO:0031145) # (full name: anaphase-promoting complex-dependent proteasomal # ubiquitin-dependent protein catabolic process ) Prdeg.genes <- unique (unlist (GO.ls[["GO:0031145"]])) Prdeg.ix <- which ((int_ppi_eg_unq.df$Source %in% Prdeg.genes) & (int_ppi_eg_unq.df$Target %in% Prdeg.genes)) int_ppi_eg_Prdeg.df <- int_ppi_eg_unq.df[Prdeg.ix, ] # 92% of APC-dependent Protein Degradation # are mapped to the physical interactions network length (Prdeg.genes) # 62 length (unique (c (int_ppi_eg_Prdeg.df$Source, int_ppi_eg_Prdeg.df$Target))) # 57 # WRITE TO CYTOSCAPE NETWORK FORMAT setwd ("C:/Users/Daniele/Documents/DM New/Università/Classes_Taught/OBI Pathway Analysis/L2_Assignment/Data_Prep") write.table ( diff_expr.df, sep = "\t", quote = F, col.names = T, row.names = F, file = "ES_MCF7_Diff.txt" ) write.table ( int_ppi_eg_RepF.df, sep = "\t", quote = F, col.names = T, row.names = F, file = "ppiNetw_RepF_eg.txt" ) write.table ( int_ppi_eg_RepDNA.df, sep = "\t", quote = F, col.names = T, row.names = F, file = "ppiNetw_RepDNA_eg.txt" ) write.table ( int_ppi_eg_Prdeg.df, sep = "\t", quote = F, col.names = T, row.names = F, file = "ppiNetw_Prdeg_eg.txt" ) }}}