Introduction

This study examined the transcriptional responses of PCB 126 in epithelial cells derived from human breast. To check the details of the study, please see Morin et al 2022

  • Compare control vs PCB-treated for pts 698, 1264, 1309, 1108, 1106, 1129. Sallie would like this for a publication, so this would be the highest priority.

  • Use paired design (this allows DEGs to be defined within each patient). However, with only 6 patients, this may pose a problem of degrees of freedom.

  • Therefore, it may be necessary to define DEGs by simply comparing Control as a group vs the PCB-treated group.

  • While it would be interesting to look at DEGs for effects of PCBs in AR vs HR, 3 patients in each would not likely be sufficient. So best to delay this.

Libraries and reusable codes

#Data Import

Prepare sample Information

#The sampleInformation is shared in the email by Joe
#The sample information file has multiple sheets. 
#We will read the excel file for sheet name
sheets <- excel_sheets("docs/RNAseq-SampleSubmission b.xls")

#The sheet name sample Information will be used
sampInf <- readxl::read_excel("docs/RNAseq-SampleSubmission b.xls", 
                              sheet = sheets[[2]],
                              skip = 29)

#We will edit the sample information to make it a tibble with only relevant information
sampleInfo <- sampInf %>% 
  na.omit %>% 
  dplyr::select(c("Sample Number","Sample Name")) %>%  
  mutate(
    sample_number = `Sample Number`,
    sample_code = str_remove_all(`Sample Name`, pattern = "[:alpha:]"),
    treatment = str_remove_all(`Sample Name`, pattern = "[:digit:]"),
    treatment = case_when(
      !str_detect(treatment, "[:alpha:]") ~ "Control",
      TRUE ~ treatment),
    sample_name = paste(sample_code, "_", treatment, sep = "")) %>% 
  select(-c(`Sample Number`,`Sample Name`)) %>% 
  column_to_rownames(var = "sample_name")


#view the sampleInfo
  datatable(sampleInfo)

write a function for importing rsem_results

#write a function to make DESeq Object
importRsem2Deseq <- function(path, sampleInfo){
        #load libraries
        packs <- c("tximport", "DESeq2")
        lapply(packs, library, character.only = TRUE)
        
        #get file paths
        files <- list.files(path = path, pattern = "genes.results")
        filepaths <- paste0(path, files)
        
        #name the filespath with sample names
        names(filepaths) <- str_remove(files, ".genes.results")
        
        #read the data
        txi.rsem <- tximport(filepaths, 
                             type = "rsem", 
                             txIn = FALSE, 
                             txOut = FALSE,)
        
        txi.rsem$length[txi.rsem$length == 0] <- 1
#the trascript length is zero in the data, which is set to 1 using the folliwng suggestion (https://support.bioconductor.org/p/84304/)
        

        #make the parameters factors
        sampleInfo$sample_code <- as.factor(sampleInfo$sample_code)
        sampleInfo$treatment <- as.factor(sampleInfo$treatment) 
        #check if all rsem column names are in sampleinfo
        if(all(as.numeric(colnames(txi.rsem$counts))==sampleInfo$sample_number)){
         colnames(txi.rsem$counts) = rownames(sampleInfo)
          #make a deseq dataset
        dds <- DESeqDataSetFromTximport(txi.rsem,
                                colData = sampleInfo,
                                design = ~ treatment)
         
        }else{dds <- print("Incorrect sampleInfo")}
        
        return(dds)
        
        
}

Make Sure files are correctly read

The filenames as provided by LCScience was 1-24, but they could not be sorted. So I changed the file name in terminal with script (./../../../DJJ003_src/change_rsem_filename.sh) and then in the above function I am changing it to a name that has tissueID_treatment each.

###Import Data

dds <- importRsem2Deseq(path = "data/rsem_quant/", sampleInfo = sampleInfo)
dds <- filterLowCounts(dds)
saveRDS(dds, file = "data/DJJ03_dds")

Diffential Gene expression analysis.

# keey only the samples which are required for this analysis
dds1 <- dds[, -grep("Control", colnames(dds))]

dds1$treatment <- droplevels(dds1$treatment)
dds1$treatment <- relevel(dds1$treatment, "DMSO")
dds1$sample_code <- droplevels(dds1$sample_code)
saveRDS(dds1, file = "data/dds1")
#save the unpaired DE design as another dds object
dds2 <- dds1 %>% DESeq()
res2 <- results(dds2, contrast = c("treatment", "PCB", "DMSO"), alpha = 0.05)
dge2 <- getSig(res2, padj = 0.05)

#introduce paired design

dds3 <- dds1
design(dds3) <- ~ sample_code + treatment
dds3$treatment <- relevel(dds3$treatment, "DMSO")

dds3 <- DESeq(dds3)
res3.1 <- results(dds3, contrast = c("treatment", "PCB", "DMSO"), alpha = 0.05)
res3.2 <- results(dds3, name = "treatment_PCB_vs_DMSO", alpha = 0.05)
dge3.2 <- getSig(res3.2, padj = 0.05)
res3.3 <- results(dds3, name = "treatment_PCB_vs_DMSO", alpha = 0.1)
dge3.3 <- getSig(res3.3, padj = 0.1)
vsd3 <- vst(dds3, blind = FALSE)
vsd3_a <- assay(vsd3)

dge3.2geneList <- dge3.2 %>% 
  as.data.frame %>% 
  rownames_to_column(var = "Gene") %>% 
  mutate(EnsemblID = str_split(Gene, "_") %>% map_chr(1),
         Gene = str_split(Gene, "_") %>% map_chr(2))

dge3.2geneList <- dge3.2geneList[order(dge3.2geneList$log2FoldChange, decreasing = TRUE),]
write.csv(dge3.2geneList, file = "results/DGE_HMEC_PCB_vs_DMSO_v2.csv")

datatable(dge3.2geneList)
pcaData <- plotPCA(vsd3, intgroup=c("treatment", "sample_code"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
p1 <- ggplot(pcaData, aes(PC1, PC2, color=treatment, shape=sample_code)) +
  geom_point(size=3) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  ggtitle("HMECs PCB vs DMSO paired")
  coord_fixed()
## <ggproto object: Class CoordFixed, CoordCartesian, Coord, gg>
##     aspect: function
##     backtransform_range: function
##     clip: on
##     default: FALSE
##     distance: function
##     expand: TRUE
##     is_free: function
##     is_linear: function
##     labels: function
##     limits: list
##     modify_scales: function
##     range: function
##     ratio: 1
##     render_axis_h: function
##     render_axis_v: function
##     render_bg: function
##     render_fg: function
##     setup_data: function
##     setup_layout: function
##     setup_panel_guides: function
##     setup_panel_params: function
##     setup_params: function
##     train_panel_guides: function
##     transform: function
##     super:  <ggproto object: Class CoordFixed, CoordCartesian, Coord, gg>
pdf(file = "figures/PCA_PCB_vs_DMSO_v2.pdf", height = 6, width = 8)
p1
dev.off()
## quartz_off_screen 
##                 2
p1

###Show the plot for the highest log2FoldChange

plotCounts(dds1, gene= "ENSG00000140465.14_CYP1A1", intgroup="treatment")

###Show the highest expressing few genes

plot.volcano.ggplot(res3.2, title = "HBEC PCB vs DMSO", padj = 0.01, lfc = 1)
## [1] "res3.2"

###Heatmap

A heatmap of the differentially expressed genes are presented here. The variance stabilized count data is scaled and presented.

getHeatmapDJJ <- function(dge, vsd) {
        mat <- twointersect(vsd, dge)
        mat <- assay(vsd)%>% 
  as.data.frame() %>% 
  rownames_to_column(var = "Gene") %>% 
  filter(Gene %in% mat) %>% 
  mutate(Gene = str_split(Gene, "_") %>% map_chr(., 2)) %>% column_to_rownames(var = "Gene")
        mat2 <- mat %>% t %>% scale %>% t
        mat2 <- mat2[,order(colnames(mat2), decreasing = T)]
        mat2 <- mat2[,order(unlist(map(colnames(mat2), reverse)),decreasing = T)]
        mat2treatment <- colnames(mat2) %>% str_split(., "_") %>% map_chr(., 2)
        h1 <- Heatmap(as.matrix(mat2),
              top_annotation = HeatmapAnnotation(
                treatment = mat2treatment, 
                col = list(
                  treatment = c("DMSO" = "Red", "PCB" = "Green"))),
              cluster_columns = FALSE,
              show_column_dend = FALSE,
              cluster_rows = TRUE,
              show_row_names = TRUE,
              row_names_gp = gpar(fontsize = 6),
              width = unit(10, "cm"), 
              height = unit(0.1 * nrow(mat2), "cm"))
return(h1)
}

#allgenes
h1 <- getHeatmapDJJ(dge = dge3.2, vsd = vsd3)
#upgegulated

dge3.2up <- dge3.2 %>% as.data.frame %>% filter(log2FoldChange > 1) 
h2 <- getHeatmapDJJ(dge3.2up, vsd3)
pdf(file = "figures/Heatmap_PCB_vs_DMSO_v2.pdf", height = 11.69, width = 8.27)
h1
dev.off()
## quartz_off_screen 
##                 2

#TF analysis + To identify the transcription factor that might be responsible for the the expression pattern, we used the (LISA Cistrome analysis)[http://lisa.cistrome.org/]. + This takes one or two genesets as input and computes the enrichment of chromatin enrichment peaks, methylation and chromatin accessibility on these genes from publicly available dataset. + This allows the LISA Cistrome to suggest possible TFs that might be responsible for the expression pattern the provided geneset. + Following the TF analysis, we do context specific curations of the result.

lisapath <- "Lisa/"
lisafiles <- list.files(lisapath, pattern = "csv")
lisanames <- lisafiles %>% str_split(".743__") %>% map_chr(2)
lisafiles <- map(paste(lisapath,lisafiles, sep = "/"), read_csv)
names(lisafiles) <- lisanames

pcb_up <- lisafiles$hg38_gs1.txt_chipseq_cauchy_combine_dedup.csv %>%
        filter(tissue %in% "Breast")
pcb_down <- lisafiles$hg38_gs2.txt_chipseq_cauchy_combine_dedup.csv %>%
        filter(tissue %in% "Breast")
pcb_tf <- full_join(pcb_up, pcb_down, by = "factor") %>% 
        mutate(pval.x = ifelse(is.na(pval.x), 1, pval.x),
               pval.y = ifelse(is.na(pval.y), 1, pval.y),
               pval.xy = case_when(
                       pval.x == 1 ~ 1,
                       pval.y == 1 ~ 1,
                       TRUE ~ 0),
               pval.xy1 = case_when(
                       pval.x != 1 ~ pval.x,
                       pval.y != 1 ~ pval.y,
                       TRUE ~ 1)
               )
pcb_tf <- dge3.2geneList %>% 
        rename(factor = Gene) %>% 
        right_join(pcb_tf) %>%
        mutate(TFcolor = case_when(log2FoldChange > 0 ~ "#e41a1c",
                                   log2FoldChange < 0 ~ "#4daf4a",
                                   TRUE ~ "#377eb8"))

        tf1 <- ggplot(pcb_tf, aes( x = -log10(pval.x),
                        y = -log10(pval.y))) +
                geom_point(color = pcb_tf$TFcolor) +
                geom_text_repel(aes( x = -log10(pval.x),
                                     y = -log10(pval.y),
                                     label = ifelse(pval.xy == 1 & -log10(pval.xy1) > 2.5, factor, "")
                                     ))+
                geom_text_repel(aes( x = -log10(pval.x),
                                     y = -log10(pval.y),
                                     label = ifelse(!is.na(log2FoldChange), factor, ""),
                                     nudge_y = 5
                                     ))+
                 geom_text_repel(aes( x = -log10(pval.x),
                                     y = -log10(pval.y),
                                     label = ifelse(grepl("FOXM1|FOS", factor), factor, "")
                                     ))+
                
                labs(x = "-log10(p-value) for up-regulated gene set",
                     y = "-log10(p-value)\n for down-regulated gene set",
                     title = "Transcriptional regulators and coregulators") +
                theme_pubr() +
                theme(plot.title = element_text(face = "bold",
                                                hjust = 0.5))
pdf(file = "figures/TR_PCB_Up_Down_v2.pdf", width = 8, height = 5)
tf1
dev.off()
## quartz_off_screen 
##                 2
tf1