phydev/gene_expression_simulation.R

## gene_expression_simulation.R
# simulation of models based on gene expression data
# we create two different datasets drawn from normal distributions
# with different mean and standard deviation
# we then normalize the data using quantile normalization
# and fit a logistic regression model with glmnet

# load libraries
library(dplyr)
library(glmnet)
library(preprocessCore)

# set seed
set.seed(123)

# define number of genes and samples
n_genes <- 5
n_samples <- 100

# generate development data
df_train <- data.frame(matrix(rnorm(n_genes*n_samples), nrow=n_samples, ncol=n_genes))

# transpose data
df_train <- t(df_train)


# generate validation data
df_val <- data.frame(matrix(rnorm(n_genes*n_samples, mean=100, sd=10), nrow=n_samples, ncol=n_genes))

# transpose data
df_val <- t(df_val)

# quantile normalization with preprocessCore
df_train_norm <- as.data.frame(normalize.quantiles(as.matrix(df_train)))
rownames(df_train_norm) <- rownames(df_train)

df_val_norm <- as.data.frame(normalize.quantiles(as.matrix(df_val)))
rownames(df_val_norm) <- rownames(df_val)

# tidy data
df_train_norm <- df_train_norm %>% t %>% as.data.frame
df_val_norm <- df_val_norm %>% t %>% as.data.frame

#hist(df_train_norm[,1])

# model definition
df_train_norm <- df_train_norm %>%
    mutate(LP = 0.01 * X1 + 0.034 * X2 - 0.056 * X3 - 0.012 * X4 + 0.023 * X5 +
    rnorm(nrow(df_train_norm), mean = 0, sd = 0.01))

df_val_norm <- df_val_norm %>%
    mutate(LP = 0.01 * X1 + 0.034 * X2 - 0.056 * X3 - 0.012 * X4 + 0.023 * X5 +
    rnorm(nrow(df_val_norm), mean = 0, sd = 0.01))

df_train_norm <- df_train_norm %>% mutate(y = ifelse(LP > median(LP), 1, 0))
df_val_norm <- df_val_norm %>% mutate(y = ifelse(LP > median(LP), 1, 0))

df_train_norm_scaled <- df_train_norm %>% mutate(across(-c(y, LP), scale))
df_val_norm_scaled <- df_val_norm %>% mutate(across(-c(y, LP), scale))

model <- glmnet(x = df_train_norm[, 1:5],
                y = df_train_norm$y,
                family = "binomial",
                alpha = 0.5,
                lambda = 0.1,
                standardize = FALSE)

predict(model,
        newx = as.matrix(df_train_norm[, 1:5]),
        type = "class")


predict(model,
        newx = as.matrix(df_val_norm[, 1:5]),
        type = "class")

# load library to compute AUC
library(pROC)

auc(as.vector(df_val_norm$y), as.vector(predict(model,
                           newx = as.matrix(df_val_norm[, 1:5]),
                           type = "response")))

auc(as.vector(df_val_norm_scaled$y), as.vector(predict(model,
                           newx = as.matrix(df_val_norm_scaled[, 1:5]),
                           type = "response")))
df_val_norm
	# simulation of models based on gene expression data
	# we create two different datasets drawn from normal distributions
	# with different mean and standard deviation
	# we then normalize the data using quantile normalization
	# and fit a logistic regression model with glmnet

	# load libraries
	library(dplyr)
	library(glmnet)
	library(preprocessCore)

	# set seed
	set.seed(123)

	# define number of genes and samples
	n_genes <- 5
	n_samples <- 100

	# generate development data
	df_train <- data.frame(matrix(rnorm(n_genes*n_samples), nrow=n_samples, ncol=n_genes))

	# transpose data
	df_train <- t(df_train)


	# generate validation data
	df_val <- data.frame(matrix(rnorm(n_genes*n_samples, mean=100, sd=10), nrow=n_samples, ncol=n_genes))

	# transpose data
	df_val <- t(df_val)

	# quantile normalization with preprocessCore
	df_train_norm <- as.data.frame(normalize.quantiles(as.matrix(df_train)))
	rownames(df_train_norm) <- rownames(df_train)

	df_val_norm <- as.data.frame(normalize.quantiles(as.matrix(df_val)))
	rownames(df_val_norm) <- rownames(df_val)

	# tidy data
	df_train_norm <- df_train_norm %>% t %>% as.data.frame
	df_val_norm <- df_val_norm %>% t %>% as.data.frame

	#hist(df_train_norm[,1])

	# model definition
	df_train_norm <- df_train_norm %>%
	mutate(LP = 0.01 * X1 + 0.034 * X2 - 0.056 * X3 - 0.012 * X4 + 0.023 * X5 +
	rnorm(nrow(df_train_norm), mean = 0, sd = 0.01))

	df_val_norm <- df_val_norm %>%
	mutate(LP = 0.01 * X1 + 0.034 * X2 - 0.056 * X3 - 0.012 * X4 + 0.023 * X5 +
	rnorm(nrow(df_val_norm), mean = 0, sd = 0.01))

	df_train_norm <- df_train_norm %>% mutate(y = ifelse(LP > median(LP), 1, 0))
	df_val_norm <- df_val_norm %>% mutate(y = ifelse(LP > median(LP), 1, 0))

	df_train_norm_scaled <- df_train_norm %>% mutate(across(-c(y, LP), scale))
	df_val_norm_scaled <- df_val_norm %>% mutate(across(-c(y, LP), scale))

	model <- glmnet(x = df_train_norm[, 1:5],
	y = df_train_norm$y,
	family = "binomial",
	alpha = 0.5,
	lambda = 0.1,
	standardize = FALSE)

	predict(model,
	newx = as.matrix(df_train_norm[, 1:5]),
	type = "class")


	predict(model,
	newx = as.matrix(df_val_norm[, 1:5]),
	type = "class")

	# load library to compute AUC
	library(pROC)

	auc(as.vector(df_val_norm$y), as.vector(predict(model,
	newx = as.matrix(df_val_norm[, 1:5]),
	type = "response")))

	auc(as.vector(df_val_norm_scaled$y), as.vector(predict(model,
	newx = as.matrix(df_val_norm_scaled[, 1:5]),
	type = "response")))
	df_val_norm