Introduction

This study examined the transcriptional responses of PCB 126 in epithelial cells derived from human breast. To check the details of the study, please see Morin et al 2022

Compare control vs PCB-treated for pts 698, 1264, 1309, 1108, 1106, 1129. Sallie would like this for a publication, so this would be the highest priority.
Use paired design (this allows DEGs to be defined within each patient). However, with only 6 patients, this may pose a problem of degrees of freedom.
Therefore, it may be necessary to define DEGs by simply comparing Control as a group vs the PCB-treated group.
While it would be interesting to look at DEGs for effects of PCBs in AR vs HR, 3 patients in each would not likely be sufficient. So best to delay this.

Libraries and reusable codes

#Data Import

Prepare sample Information

#The sampleInformation is shared in the email by Joe
#The sample information file has multiple sheets. 
#We will read the excel file for sheet name
sheets <- excel_sheets("docs/RNAseq-SampleSubmission b.xls")

#The sheet name sample Information will be used
sampInf <- readxl::read_excel("docs/RNAseq-SampleSubmission b.xls", 
                              sheet = sheets[[2]],
                              skip = 29)

#We will edit the sample information to make it a tibble with only relevant information
sampleInfo <- sampInf %>% 
  na.omit %>% 
  dplyr::select(c("Sample Number","Sample Name")) %>%  
  mutate(
    sample_number = `Sample Number`,
    sample_code = str_remove_all(`Sample Name`, pattern = "[:alpha:]"),
    treatment = str_remove_all(`Sample Name`, pattern = "[:digit:]"),
    treatment = case_when(
      !str_detect(treatment, "[:alpha:]") ~ "Control",
      TRUE ~ treatment),
    sample_name = paste(sample_code, "_", treatment, sep = "")) %>% 
  select(-c(`Sample Number`,`Sample Name`)) %>% 
  column_to_rownames(var = "sample_name")


#view the sampleInfo
  datatable(sampleInfo)

write a function for importing rsem_results

#write a function to make DESeq Object
importRsem2Deseq <- function(path, sampleInfo){
        #load libraries
        packs <- c("tximport", "DESeq2")
        lapply(packs, library, character.only = TRUE)
        
        #get file paths
        files <- list.files(path = path, pattern = "genes.results")
        filepaths <- paste0(path, files)
        
        #name the filespath with sample names
        names(filepaths) <- str_remove(files, ".genes.results")
        
        #read the data
        txi.rsem <- tximport(filepaths, 
                             type = "rsem", 
                             txIn = FALSE, 
                             txOut = FALSE,)
        
        txi.rsem$length[txi.rsem$length == 0] <- 1
#the trascript length is zero in the data, which is set to 1 using the folliwng suggestion (https://support.bioconductor.org/p/84304/)
        

        #make the parameters factors
        sampleInfo$sample_code <- as.factor(sampleInfo$sample_code)
        sampleInfo$treatment <- as.factor(sampleInfo$treatment) 
        #check if all rsem column names are in sampleinfo
        if(all(as.numeric(colnames(txi.rsem$counts))==sampleInfo$sample_number)){
         colnames(txi.rsem$counts) = rownames(sampleInfo)
          #make a deseq dataset
        dds <- DESeqDataSetFromTximport(txi.rsem,
                                colData = sampleInfo,
                                design = ~ treatment)
         
        }else{dds <- print("Incorrect sampleInfo")}
        
        return(dds)
        
        
}

Make Sure files are correctly read

The filenames as provided by LCScience was 1-24, but they could not be sorted. So I changed the file name in terminal with script (./../../../DJJ003_src/change_rsem_filename.sh) and then in the above function I am changing it to a name that has tissueID_treatment each.

###Import Data

dds <- importRsem2Deseq(path = "data/rsem_quant/", sampleInfo = sampleInfo)
dds <- filterLowCounts(dds)
saveRDS(dds, file = "data/DJJ03_dds")

Diffential Gene expression analysis.

# keey only the samples which are required for this analysis
dds1 <- dds[, -grep("Control", colnames(dds))]

dds1$treatment <- droplevels(dds1$treatment)
dds1$treatment <- relevel(dds1$treatment, "DMSO")
dds1$sample_code <- droplevels(dds1$sample_code)
saveRDS(dds1, file = "data/dds1")
#save the unpaired DE design as another dds object
dds2 <- dds1 %>% DESeq()
res2 <- results(dds2, contrast = c("treatment", "PCB", "DMSO"), alpha = 0.05)
dge2 <- getSig(res2, padj = 0.05)

#introduce paired design

dds3 <- dds1
design(dds3) <- ~ sample_code + treatment
dds3$treatment <- relevel(dds3$treatment, "DMSO")

dds3 <- DESeq(dds3)
res3.1 <- results(dds3, contrast = c("treatment", "PCB", "DMSO"), alpha = 0.05)
res3.2 <- results(dds3, name = "treatment_PCB_vs_DMSO", alpha = 0.05)
dge3.2 <- getSig(res3.2, padj = 0.05)
res3.3 <- results(dds3, name = "treatment_PCB_vs_DMSO", alpha = 0.1)
dge3.3 <- getSig(res3.3, padj = 0.1)
vsd3 <- vst(dds3, blind = FALSE)
vsd3_a <- assay(vsd3)

dge3.2geneList <- dge3.2 %>% 
  as.data.frame %>% 
  rownames_to_column(var = "Gene") %>% 
  mutate(EnsemblID = str_split(Gene, "_") %>% map_chr(1),
         Gene = str_split(Gene, "_") %>% map_chr(2))

dge3.2geneList <- dge3.2geneList[order(dge3.2geneList$log2FoldChange, decreasing = TRUE),]
write.csv(dge3.2geneList, file = "results/DGE_HMEC_PCB_vs_DMSO_v2.csv")

datatable(dge3.2geneList)

pcaData <- plotPCA(vsd3, intgroup=c("treatment", "sample_code"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
p1 <- ggplot(pcaData, aes(PC1, PC2, color=treatment, shape=sample_code)) +
  geom_point(size=3) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  ggtitle("HMECs PCB vs DMSO paired")
  coord_fixed()

## <ggproto object: Class CoordFixed, CoordCartesian, Coord, gg>
##     aspect: function
##     backtransform_range: function
##     clip: on
##     default: FALSE
##     distance: function
##     expand: TRUE
##     is_free: function
##     is_linear: function
##     labels: function
##     limits: list
##     modify_scales: function
##     range: function
##     ratio: 1
##     render_axis_h: function
##     render_axis_v: function
##     render_bg: function
##     render_fg: function
##     setup_data: function
##     setup_layout: function
##     setup_panel_guides: function
##     setup_panel_params: function
##     setup_params: function
##     train_panel_guides: function
##     transform: function
##     super:  <ggproto object: Class CoordFixed, CoordCartesian, Coord, gg>

pdf(file = "figures/PCA_PCB_vs_DMSO_v2.pdf", height = 6, width = 8)
p1
dev.off()

## quartz_off_screen 
##                 2

p1

###Show the plot for the highest log2FoldChange

plotCounts(dds1, gene= "ENSG00000140465.14_CYP1A1", intgroup="treatment")

###Show the highest expressing few genes

plot.volcano.ggplot(res3.2, title = "HBEC PCB vs DMSO", padj = 0.01, lfc = 1)

## [1] "res3.2"

###Heatmap

A heatmap of the differentially expressed genes are presented here. The variance stabilized count data is scaled and presented.

getHeatmapDJJ <- function(dge, vsd) {
        mat <- twointersect(vsd, dge)
        mat <- assay(vsd)%>% 
  as.data.frame() %>% 
  rownames_to_column(var = "Gene") %>% 
  filter(Gene %in% mat) %>% 
  mutate(Gene = str_split(Gene, "_") %>% map_chr(., 2)) %>% column_to_rownames(var = "Gene")
        mat2 <- mat %>% t %>% scale %>% t
        mat2 <- mat2[,order(colnames(mat2), decreasing = T)]
        mat2 <- mat2[,order(unlist(map(colnames(mat2), reverse)),decreasing = T)]
        mat2treatment <- colnames(mat2) %>% str_split(., "_") %>% map_chr(., 2)
        h1 <- Heatmap(as.matrix(mat2),
              top_annotation = HeatmapAnnotation(
                treatment = mat2treatment, 
                col = list(
                  treatment = c("DMSO" = "Red", "PCB" = "Green"))),
              cluster_columns = FALSE,
              show_column_dend = FALSE,
              cluster_rows = TRUE,
              show_row_names = TRUE,
              row_names_gp = gpar(fontsize = 6),
              width = unit(10, "cm"), 
              height = unit(0.1 * nrow(mat2), "cm"))
return(h1)
}

#allgenes
h1 <- getHeatmapDJJ(dge = dge3.2, vsd = vsd3)
#upgegulated

dge3.2up <- dge3.2 %>% as.data.frame %>% filter(log2FoldChange > 1) 
h2 <- getHeatmapDJJ(dge3.2up, vsd3)
pdf(file = "figures/Heatmap_PCB_vs_DMSO_v2.pdf", height = 11.69, width = 8.27)
h1
dev.off()

## quartz_off_screen 
##                 2

#TF analysis + To identify the transcription factor that might be responsible for the the expression pattern, we used the (LISA Cistrome analysis)[http://lisa.cistrome.org/]. + This takes one or two genesets as input and computes the enrichment of chromatin enrichment peaks, methylation and chromatin accessibility on these genes from publicly available dataset. + This allows the LISA Cistrome to suggest possible TFs that might be responsible for the expression pattern the provided geneset. + Following the TF analysis, we do context specific curations of the result.

lisapath <- "Lisa/"
lisafiles <- list.files(lisapath, pattern = "csv")
lisanames <- lisafiles %>% str_split(".743__") %>% map_chr(2)
lisafiles <- map(paste(lisapath,lisafiles, sep = "/"), read_csv)
names(lisafiles) <- lisanames

pcb_up <- lisafiles$hg38_gs1.txt_chipseq_cauchy_combine_dedup.csv %>%
        filter(tissue %in% "Breast")
pcb_down <- lisafiles$hg38_gs2.txt_chipseq_cauchy_combine_dedup.csv %>%
        filter(tissue %in% "Breast")
pcb_tf <- full_join(pcb_up, pcb_down, by = "factor") %>% 
        mutate(pval.x = ifelse(is.na(pval.x), 1, pval.x),
               pval.y = ifelse(is.na(pval.y), 1, pval.y),
               pval.xy = case_when(
                       pval.x == 1 ~ 1,
                       pval.y == 1 ~ 1,
                       TRUE ~ 0),
               pval.xy1 = case_when(
                       pval.x != 1 ~ pval.x,
                       pval.y != 1 ~ pval.y,
                       TRUE ~ 1)
               )
pcb_tf <- dge3.2geneList %>% 
        rename(factor = Gene) %>% 
        right_join(pcb_tf) %>%
        mutate(TFcolor = case_when(log2FoldChange > 0 ~ "#e41a1c",
                                   log2FoldChange < 0 ~ "#4daf4a",
                                   TRUE ~ "#377eb8"))

        tf1 <- ggplot(pcb_tf, aes( x = -log10(pval.x),
                        y = -log10(pval.y))) +
                geom_point(color = pcb_tf$TFcolor) +
                geom_text_repel(aes( x = -log10(pval.x),
                                     y = -log10(pval.y),
                                     label = ifelse(pval.xy == 1 & -log10(pval.xy1) > 2.5, factor, "")
                                     ))+
                geom_text_repel(aes( x = -log10(pval.x),
                                     y = -log10(pval.y),
                                     label = ifelse(!is.na(log2FoldChange), factor, ""),
                                     nudge_y = 5
                                     ))+
                 geom_text_repel(aes( x = -log10(pval.x),
                                     y = -log10(pval.y),
                                     label = ifelse(grepl("FOXM1|FOS", factor), factor, "")
                                     ))+
                
                labs(x = "-log10(p-value) for up-regulated gene set",
                     y = "-log10(p-value)\n for down-regulated gene set",
                     title = "Transcriptional regulators and coregulators") +
                theme_pubr() +
                theme(plot.title = element_text(face = "bold",
                                                hjust = 0.5))
pdf(file = "figures/TR_PCB_Up_Down_v2.pdf", width = 8, height = 5)
tf1
dev.off()

## quartz_off_screen 
##                 2

tf1

DJJ-003_Analyses of RoH Breast Cell

djj_003_pdm_bi_010_v1.Rmd

Prabin Dhangada Majhi