This study examined the transcriptional responses of PCB 126 in epithelial cells derived from human breast. To check the details of the study, please see Morin et al 2022
Compare control vs PCB-treated for pts 698, 1264, 1309, 1108, 1106, 1129. Sallie would like this for a publication, so this would be the highest priority.
Use paired design (this allows DEGs to be defined within each patient). However, with only 6 patients, this may pose a problem of degrees of freedom.
Therefore, it may be necessary to define DEGs by simply comparing Control as a group vs the PCB-treated group.
While it would be interesting to look at DEGs for effects of PCBs in AR vs HR, 3 patients in each would not likely be sufficient. So best to delay this.
#Data Import
#The sampleInformation is shared in the email by Joe
#The sample information file has multiple sheets.
#We will read the excel file for sheet name
sheets <- excel_sheets("docs/RNAseq-SampleSubmission b.xls")
#The sheet name sample Information will be used
sampInf <- readxl::read_excel("docs/RNAseq-SampleSubmission b.xls",
sheet = sheets[[2]],
skip = 29)
#We will edit the sample information to make it a tibble with only relevant information
sampleInfo <- sampInf %>%
na.omit %>%
dplyr::select(c("Sample Number","Sample Name")) %>%
mutate(
sample_number = `Sample Number`,
sample_code = str_remove_all(`Sample Name`, pattern = "[:alpha:]"),
treatment = str_remove_all(`Sample Name`, pattern = "[:digit:]"),
treatment = case_when(
!str_detect(treatment, "[:alpha:]") ~ "Control",
TRUE ~ treatment),
sample_name = paste(sample_code, "_", treatment, sep = "")) %>%
select(-c(`Sample Number`,`Sample Name`)) %>%
column_to_rownames(var = "sample_name")
#view the sampleInfo
datatable(sampleInfo)
#write a function to make DESeq Object
importRsem2Deseq <- function(path, sampleInfo){
#load libraries
packs <- c("tximport", "DESeq2")
lapply(packs, library, character.only = TRUE)
#get file paths
files <- list.files(path = path, pattern = "genes.results")
filepaths <- paste0(path, files)
#name the filespath with sample names
names(filepaths) <- str_remove(files, ".genes.results")
#read the data
txi.rsem <- tximport(filepaths,
type = "rsem",
txIn = FALSE,
txOut = FALSE,)
txi.rsem$length[txi.rsem$length == 0] <- 1
#the trascript length is zero in the data, which is set to 1 using the folliwng suggestion (https://support.bioconductor.org/p/84304/)
#make the parameters factors
sampleInfo$sample_code <- as.factor(sampleInfo$sample_code)
sampleInfo$treatment <- as.factor(sampleInfo$treatment)
#check if all rsem column names are in sampleinfo
if(all(as.numeric(colnames(txi.rsem$counts))==sampleInfo$sample_number)){
colnames(txi.rsem$counts) = rownames(sampleInfo)
#make a deseq dataset
dds <- DESeqDataSetFromTximport(txi.rsem,
colData = sampleInfo,
design = ~ treatment)
}else{dds <- print("Incorrect sampleInfo")}
return(dds)
}
The filenames as provided by LCScience was 1-24, but they could not
be sorted. So I changed the file name in terminal with script
(./../../../DJJ003_src/change_rsem_filename.sh
) and then in
the above function I am changing it to a name that has
tissueID_treatment each.
###Import Data
dds <- importRsem2Deseq(path = "data/rsem_quant/", sampleInfo = sampleInfo)
dds <- filterLowCounts(dds)
saveRDS(dds, file = "data/DJJ03_dds")
# keey only the samples which are required for this analysis
dds1 <- dds[, -grep("Control", colnames(dds))]
dds1$treatment <- droplevels(dds1$treatment)
dds1$treatment <- relevel(dds1$treatment, "DMSO")
dds1$sample_code <- droplevels(dds1$sample_code)
saveRDS(dds1, file = "data/dds1")
#save the unpaired DE design as another dds object
dds2 <- dds1 %>% DESeq()
res2 <- results(dds2, contrast = c("treatment", "PCB", "DMSO"), alpha = 0.05)
dge2 <- getSig(res2, padj = 0.05)
#introduce paired design
dds3 <- dds1
design(dds3) <- ~ sample_code + treatment
dds3$treatment <- relevel(dds3$treatment, "DMSO")
dds3 <- DESeq(dds3)
res3.1 <- results(dds3, contrast = c("treatment", "PCB", "DMSO"), alpha = 0.05)
res3.2 <- results(dds3, name = "treatment_PCB_vs_DMSO", alpha = 0.05)
dge3.2 <- getSig(res3.2, padj = 0.05)
res3.3 <- results(dds3, name = "treatment_PCB_vs_DMSO", alpha = 0.1)
dge3.3 <- getSig(res3.3, padj = 0.1)
vsd3 <- vst(dds3, blind = FALSE)
vsd3_a <- assay(vsd3)
dge3.2geneList <- dge3.2 %>%
as.data.frame %>%
rownames_to_column(var = "Gene") %>%
mutate(EnsemblID = str_split(Gene, "_") %>% map_chr(1),
Gene = str_split(Gene, "_") %>% map_chr(2))
dge3.2geneList <- dge3.2geneList[order(dge3.2geneList$log2FoldChange, decreasing = TRUE),]
write.csv(dge3.2geneList, file = "results/DGE_HMEC_PCB_vs_DMSO_v2.csv")
datatable(dge3.2geneList)
pcaData <- plotPCA(vsd3, intgroup=c("treatment", "sample_code"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
p1 <- ggplot(pcaData, aes(PC1, PC2, color=treatment, shape=sample_code)) +
geom_point(size=3) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
ggtitle("HMECs PCB vs DMSO paired")
coord_fixed()
## <ggproto object: Class CoordFixed, CoordCartesian, Coord, gg>
## aspect: function
## backtransform_range: function
## clip: on
## default: FALSE
## distance: function
## expand: TRUE
## is_free: function
## is_linear: function
## labels: function
## limits: list
## modify_scales: function
## range: function
## ratio: 1
## render_axis_h: function
## render_axis_v: function
## render_bg: function
## render_fg: function
## setup_data: function
## setup_layout: function
## setup_panel_guides: function
## setup_panel_params: function
## setup_params: function
## train_panel_guides: function
## transform: function
## super: <ggproto object: Class CoordFixed, CoordCartesian, Coord, gg>
pdf(file = "figures/PCA_PCB_vs_DMSO_v2.pdf", height = 6, width = 8)
p1
dev.off()
## quartz_off_screen
## 2
p1
###Show the plot for the highest log2FoldChange
plotCounts(dds1, gene= "ENSG00000140465.14_CYP1A1", intgroup="treatment")
###Show the highest expressing few genes
plot.volcano.ggplot(res3.2, title = "HBEC PCB vs DMSO", padj = 0.01, lfc = 1)
## [1] "res3.2"
###Heatmap
A heatmap of the differentially expressed genes are presented here. The variance stabilized count data is scaled and presented.
getHeatmapDJJ <- function(dge, vsd) {
mat <- twointersect(vsd, dge)
mat <- assay(vsd)%>%
as.data.frame() %>%
rownames_to_column(var = "Gene") %>%
filter(Gene %in% mat) %>%
mutate(Gene = str_split(Gene, "_") %>% map_chr(., 2)) %>% column_to_rownames(var = "Gene")
mat2 <- mat %>% t %>% scale %>% t
mat2 <- mat2[,order(colnames(mat2), decreasing = T)]
mat2 <- mat2[,order(unlist(map(colnames(mat2), reverse)),decreasing = T)]
mat2treatment <- colnames(mat2) %>% str_split(., "_") %>% map_chr(., 2)
h1 <- Heatmap(as.matrix(mat2),
top_annotation = HeatmapAnnotation(
treatment = mat2treatment,
col = list(
treatment = c("DMSO" = "Red", "PCB" = "Green"))),
cluster_columns = FALSE,
show_column_dend = FALSE,
cluster_rows = TRUE,
show_row_names = TRUE,
row_names_gp = gpar(fontsize = 6),
width = unit(10, "cm"),
height = unit(0.1 * nrow(mat2), "cm"))
return(h1)
}
#allgenes
h1 <- getHeatmapDJJ(dge = dge3.2, vsd = vsd3)
#upgegulated
dge3.2up <- dge3.2 %>% as.data.frame %>% filter(log2FoldChange > 1)
h2 <- getHeatmapDJJ(dge3.2up, vsd3)
pdf(file = "figures/Heatmap_PCB_vs_DMSO_v2.pdf", height = 11.69, width = 8.27)
h1
dev.off()
## quartz_off_screen
## 2
#TF analysis + To identify the transcription factor that might be responsible for the the expression pattern, we used the (LISA Cistrome analysis)[http://lisa.cistrome.org/]. + This takes one or two genesets as input and computes the enrichment of chromatin enrichment peaks, methylation and chromatin accessibility on these genes from publicly available dataset. + This allows the LISA Cistrome to suggest possible TFs that might be responsible for the expression pattern the provided geneset. + Following the TF analysis, we do context specific curations of the result.
lisapath <- "Lisa/"
lisafiles <- list.files(lisapath, pattern = "csv")
lisanames <- lisafiles %>% str_split(".743__") %>% map_chr(2)
lisafiles <- map(paste(lisapath,lisafiles, sep = "/"), read_csv)
names(lisafiles) <- lisanames
pcb_up <- lisafiles$hg38_gs1.txt_chipseq_cauchy_combine_dedup.csv %>%
filter(tissue %in% "Breast")
pcb_down <- lisafiles$hg38_gs2.txt_chipseq_cauchy_combine_dedup.csv %>%
filter(tissue %in% "Breast")
pcb_tf <- full_join(pcb_up, pcb_down, by = "factor") %>%
mutate(pval.x = ifelse(is.na(pval.x), 1, pval.x),
pval.y = ifelse(is.na(pval.y), 1, pval.y),
pval.xy = case_when(
pval.x == 1 ~ 1,
pval.y == 1 ~ 1,
TRUE ~ 0),
pval.xy1 = case_when(
pval.x != 1 ~ pval.x,
pval.y != 1 ~ pval.y,
TRUE ~ 1)
)
pcb_tf <- dge3.2geneList %>%
rename(factor = Gene) %>%
right_join(pcb_tf) %>%
mutate(TFcolor = case_when(log2FoldChange > 0 ~ "#e41a1c",
log2FoldChange < 0 ~ "#4daf4a",
TRUE ~ "#377eb8"))
tf1 <- ggplot(pcb_tf, aes( x = -log10(pval.x),
y = -log10(pval.y))) +
geom_point(color = pcb_tf$TFcolor) +
geom_text_repel(aes( x = -log10(pval.x),
y = -log10(pval.y),
label = ifelse(pval.xy == 1 & -log10(pval.xy1) > 2.5, factor, "")
))+
geom_text_repel(aes( x = -log10(pval.x),
y = -log10(pval.y),
label = ifelse(!is.na(log2FoldChange), factor, ""),
nudge_y = 5
))+
geom_text_repel(aes( x = -log10(pval.x),
y = -log10(pval.y),
label = ifelse(grepl("FOXM1|FOS", factor), factor, "")
))+
labs(x = "-log10(p-value) for up-regulated gene set",
y = "-log10(p-value)\n for down-regulated gene set",
title = "Transcriptional regulators and coregulators") +
theme_pubr() +
theme(plot.title = element_text(face = "bold",
hjust = 0.5))
pdf(file = "figures/TR_PCB_Up_Down_v2.pdf", width = 8, height = 5)
tf1
dev.off()
## quartz_off_screen
## 2
tf1