# IMPORT ANNOTATION FROM LIBRARIES

library (org.Hs.eg.db)

Ann_eg2sy.chv <- unlist (as.list (org.Hs.egSYMBOL))

Ann_sy2eg.chv <- names (Ann_eg2sy.chv)
names (Ann_sy2eg.chv) <- Ann_eg2sy.chv 

Ann_eg2name.chv <- unlist (as.list (org.Hs.egGENENAME))

# GENERATE DIFFERENTIAL EXPRESSION SCORES

# Import expression matrix in GSEA format

setwd ("C:/Users/Daniele/Documents/DM New/Università/Classes_Taught/OBI Pathway Analysis/L2_Assignment/Data_Core")

expr.df <- read.table (
        file   = "MCF7_ExprMx_v2.txt",
        sep    = "\t",
        header = T,
        quote  = "",
        stringsAsFactors = F
        )

# Define function       
#   the score is -log(p-value) multiplied by
#   (-1) for DOWN and (+1) for UP
        
f.t_test <- function (input.nv)
        {
        t.htest <- t.test (x = input.nv[1: 3], y = input.nv[4: 6], alternative = "two.sided")
        logpv.n <- - log (t.htest$p.value) * sign (t.htest$statistic)
        return (logpv.n)
        }

# Apply function        
        
diff_expr.mx <- matrix (ncol = 2, nrow = nrow (expr.df))        
diff_expr.mx[, 1] <- apply (expr.df[, grep (colnames (expr.df), pattern = "_12h_")], 1, f.t_test)       
# apply (expr.df[, c ("E2_12h_01", "E2_12h_02", "E2_12h_03", "NT_12h_01", "NT_12h_02", "NT_12h_03")], 1, f.t_test)
diff_expr.mx[, 2] <- apply (expr.df[, grep (colnames (expr.df), pattern = "_24h_")], 1, f.t_test)       
# apply (expr.df[, c ("E2_24h_01", "E2_24h_02", "E2_24h_03", "NT_24h_01", "NT_24h_02", "NT_24h_03")], 1, f.t_test)
colnames (diff_expr.mx) <- c ("diff_12h", "diff_24h")
rownames (diff_expr.mx) <- expr.df$NAME

# Make data.frame

diff_expr_1.df <- data.frame (
                        GeneID   = rownames (diff_expr.mx), 
                        GeneSy   = Ann_eg2sy.chv  [as.character (expr.df$NAME)],
                        GeneName = Ann_eg2name.chv[as.character (expr.df$NAME)], 
                        stringsAsFactors = F
                        )
diff_expr_2.df <- as.data.frame (diff_expr.mx)
diff_expr.df <- cbind (diff_expr_1.df, diff_expr_2.df)

# IMPORT AND FORMAT HUMAN INTERACTION DATA

# Import .sif file into data.frame

int.df <- read.table (
        file   = "homo-sapiens.sif",
        sep    = "\t",
        header = F,
        quote  = "",
        stringsAsFactors = F
        )
colnames (int.df) <- c ("Source", "IntType", "Target")
        
# Restrict to "INTERACTS_WITH"

int_ppi.df <- int.df[int.df$IntType == "INTERACTS_WITH", ]      

nrow (int.df)
# 245077
nrow (int_ppi.df)
# 113203

# Convert to Entrez-gene

int_ppi_eg.df <- int_ppi.df

int_ppi_eg.df$Source <- Ann_sy2eg.chv[int_ppi.df$Source]
int_ppi_eg.df$Target <- Ann_sy2eg.chv[int_ppi.df$Target]

# Remove lines that were not completely converted
# (unconverted values are NAs)

sel.ix <- which ((! is.na (int_ppi_eg.df$Source)) & (! is.na (int_ppi_eg.df$Target)))
int_ppi_eg.df <- int_ppi_eg.df[sel.ix, ]

nrow (int_ppi_eg.df)
# [1] 109081

# lost in conversion:
113203 - 109081
# 4122 (3.6%)

# Remove duplications
# 1) order pairs
# 2) remove redundant pairs

f.orderPairs <- function (input.chv)
        {
        output.chv <- input.chv
        sorted.chv <- sort (output.chv[c (1, 3)])
        output.chv[1] <- sorted.chv[1]
        output.chv[3] <- sorted.chv[2]
        return (output.chv)
        }

int_ppi_eg_unq.df <- unique (
                        as.data.frame (
                                t (
                                apply (int_ppi_eg.df, 1, f.orderPairs)
                                   ),
                                stringsAsFactors = F
                                )
                        )

nrow (int_ppi_eg_unq.df)
# 103588

# Restrict network to functional group of interest

GO.ls <- as.list (org.Hs.egGO2ALLEGS)

# - Replication Fork (GO:0005657)

RepF.genes <- unique (unlist (GO.ls[["GO:0005657"]]))

RepF.ix <- which ((int_ppi_eg_unq.df$Source %in% RepF.genes) & (int_ppi_eg_unq.df$Target %in% RepF.genes))
int_ppi_eg_RepF.df <- int_ppi_eg_unq.df[RepF.ix, ]

# 90% of Replication Fork genes 
# are mapped to the physical interactions network
length (RepF.genes)
# 30
length (unique (c (int_ppi_eg_RepF.df$Source, int_ppi_eg_RepF.df$Target)))
# 27

# - DNA Replication (GO:0006260)

RepDNA.genes <- unique (unlist (GO.ls[["GO:0006260"]]))

RepDNA.ix <- which ((int_ppi_eg_unq.df$Source %in% RepDNA.genes) & (int_ppi_eg_unq.df$Target %in% RepDNA.genes))
int_ppi_eg_RepDNA.df <- int_ppi_eg_unq.df[RepDNA.ix, ]

# 64.2% of DNA replication genes
# are mapped to the physical interactions network
length (RepDNA.genes)
# 232
length (unique (c (int_ppi_eg_RepDNA.df$Source, int_ppi_eg_RepDNA.df$Target)))
# 149

# - APC-dependent Protein Degradation (GO:0031145)
#   (full name: anaphase-promoting complex-dependent proteasomal 
#               ubiquitin-dependent protein catabolic process )

Prdeg.genes <- unique (unlist (GO.ls[["GO:0031145"]]))

Prdeg.ix <- which ((int_ppi_eg_unq.df$Source %in% Prdeg.genes) & (int_ppi_eg_unq.df$Target %in% Prdeg.genes))
int_ppi_eg_Prdeg.df <- int_ppi_eg_unq.df[Prdeg.ix, ]

# 92% of APC-dependent Protein Degradation
# are mapped to the physical interactions network
length (Prdeg.genes)
# 62
length (unique (c (int_ppi_eg_Prdeg.df$Source, int_ppi_eg_Prdeg.df$Target)))
# 57

# WRITE TO CYTOSCAPE NETWORK FORMAT

setwd ("C:/Users/Daniele/Documents/DM New/Università/Classes_Taught/OBI Pathway Analysis/L2_Assignment/Data_Prep")

write.table (
        diff_expr.df,
        sep   = "\t",
        quote = F,
        col.names = T,
        row.names = F,
        file  = "ES_MCF7_Diff.txt"
        )

write.table (
        int_ppi_eg_RepF.df,
        sep   = "\t",
        quote = F,
        col.names = T,
        row.names = F,
        file  = "ppiNetw_RepF_eg.txt"
        )

write.table (
        int_ppi_eg_RepDNA.df,
        sep   = "\t",
        quote = F,
        col.names = T,
        row.names = F,
        file  = "ppiNetw_RepDNA_eg.txt"
        )
        
write.table (
        int_ppi_eg_Prdeg.df,
        sep   = "\t",
        quote = F,
        col.names = T,
        row.names = F,
        file  = "ppiNetw_Prdeg_eg.txt"
        )

DanieleMerico/Code/NetworkGexpr_01 (last edited 2010-04-06 17:14:44 by DanieleMerico)

MoinMoin Appliance - Powered by TurnKey Linux