Created
April 24, 2023 21:41
-
-
Save phydev/151a7ec36fbac3158655d6f1b81eb6d8 to your computer and use it in GitHub Desktop.
simulation of phenotypic models based on gene expression data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# simulation of models based on gene expression data | |
# we create two different datasets drawn from normal distributions | |
# with different mean and standard deviation | |
# we then normalize the data using quantile normalization | |
# and fit a logistic regression model with glmnet | |
# load libraries | |
library(dplyr) | |
library(glmnet) | |
library(preprocessCore) | |
# set seed | |
set.seed(123) | |
# define number of genes and samples | |
n_genes <- 5 | |
n_samples <- 100 | |
# generate development data | |
df_train <- data.frame(matrix(rnorm(n_genes*n_samples), nrow=n_samples, ncol=n_genes)) | |
# transpose data | |
df_train <- t(df_train) | |
# generate validation data | |
df_val <- data.frame(matrix(rnorm(n_genes*n_samples, mean=100, sd=10), nrow=n_samples, ncol=n_genes)) | |
# transpose data | |
df_val <- t(df_val) | |
# quantile normalization with preprocessCore | |
df_train_norm <- as.data.frame(normalize.quantiles(as.matrix(df_train))) | |
rownames(df_train_norm) <- rownames(df_train) | |
df_val_norm <- as.data.frame(normalize.quantiles(as.matrix(df_val))) | |
rownames(df_val_norm) <- rownames(df_val) | |
# tidy data | |
df_train_norm <- df_train_norm %>% t %>% as.data.frame | |
df_val_norm <- df_val_norm %>% t %>% as.data.frame | |
#hist(df_train_norm[,1]) | |
# model definition | |
df_train_norm <- df_train_norm %>% | |
mutate(LP = 0.01 * X1 + 0.034 * X2 - 0.056 * X3 - 0.012 * X4 + 0.023 * X5 + | |
rnorm(nrow(df_train_norm), mean = 0, sd = 0.01)) | |
df_val_norm <- df_val_norm %>% | |
mutate(LP = 0.01 * X1 + 0.034 * X2 - 0.056 * X3 - 0.012 * X4 + 0.023 * X5 + | |
rnorm(nrow(df_val_norm), mean = 0, sd = 0.01)) | |
df_train_norm <- df_train_norm %>% mutate(y = ifelse(LP > median(LP), 1, 0)) | |
df_val_norm <- df_val_norm %>% mutate(y = ifelse(LP > median(LP), 1, 0)) | |
df_train_norm_scaled <- df_train_norm %>% mutate(across(-c(y, LP), scale)) | |
df_val_norm_scaled <- df_val_norm %>% mutate(across(-c(y, LP), scale)) | |
model <- glmnet(x = df_train_norm[, 1:5], | |
y = df_train_norm$y, | |
family = "binomial", | |
alpha = 0.5, | |
lambda = 0.1, | |
standardize = FALSE) | |
predict(model, | |
newx = as.matrix(df_train_norm[, 1:5]), | |
type = "class") | |
predict(model, | |
newx = as.matrix(df_val_norm[, 1:5]), | |
type = "class") | |
# load library to compute AUC | |
library(pROC) | |
auc(as.vector(df_val_norm$y), as.vector(predict(model, | |
newx = as.matrix(df_val_norm[, 1:5]), | |
type = "response"))) | |
auc(as.vector(df_val_norm_scaled$y), as.vector(predict(model, | |
newx = as.matrix(df_val_norm_scaled[, 1:5]), | |
type = "response"))) | |
df_val_norm |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment