Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 65 additions & 72 deletions R/transformTFIDF.R
Original file line number Diff line number Diff line change
@@ -1,77 +1,72 @@
#' Transform/normalize compartment calls using TF-IDF
#'
#' @name transformTFIDF
#'
#' @param obj n x p input matrix (n = samples/cells; p = compartments)
#' @details
#' This function and its helpers were modeled after or taken from:
#' - http://andrewjohnhill.com/images/posts/2019-5-6-dimensionality-reduction-for-scatac-data/analysis.html
#' - https://divingintogeneticsandgenomics.rbind.io/post/clustering-scatacseq-data-the-tf-idf-way/
#'
#' @param mat n x p input matrix (n = samples/cells; p = compartments)
#' @param scale.factor Scaling factor for the term-frequency (TF)
#'
#' @return A TF-IDF transformed matrix of the same dimensions as the input
#'
#' @import Matrix
#'
#' @examples
#'
#' m <- 1000
#' n <- 100
#' mat <- round(matrix(runif(m*n), m, n))
#' #Input needs to be a tall matrix
#' mat <- round(matrix(runif(m * n), m, n))
#' # Input needs to be a tall matrix
#' tfidf <- transformTFIDF(mat)
#'
#'
#' @export
transformTFIDF <- function(obj, scale.factor = 1e5) {
#this filters using TF-IDF on a *matrix* object
if (!is(obj, "matrix") & !is(obj, "Matrix")) {
transformTFIDF <- function(mat, scale.factor = 1e5) {
if (!is(mat, "matrix") & !is(mat, "Matrix")) {
stop("Input needs to be a matrix.")
}
#
# FIXME: cite the following in the docs
#
#the following code was modeled after or taken from:
#http://andrewjohnhill.com/images/posts/2019-5-6-dimensionality-reduction-for-scatac-data/analysis.html
#https://divingintogeneticsandgenomics.rbind.io/post/clustering-scatacseq-data-the-tf-idf-way/
#binarize the matrix
#this assumes n x p matrix (e.g. a wide matrix)
#check and transpose as needed
#input matrix is tall
if (dim(obj)[1] > dim(obj)[2]) obj <- t(obj)
#make sparse
obj.binary <- Matrix(.binarizeMatrix(t(obj)), sparse = TRUE)
#compute term-frequency
tf <- t(t(obj.binary) / Matrix::colSums(obj.binary))
#scale
tf@x <- log1p(tf@x * scale.factor)
#inverse-document frequency smooth
idf <- log(1 + ncol(obj.binary) / Matrix::rowSums(obj.binary))
#transform
tfidf <- .tfidf(tf, idf)
#cast back to a matrix since things like UMAP don't like sparse matrices

# binarize the matrix
# this assumes n x p matrix (e.g. a wide matrix)
# check and transpose as needed
# input matrix is tall
if (dim(mat)[1] > dim(mat)[2]) mat <- t(mat)

# make sparse
mat.binary <- Matrix(.binarizeMatrix(t(mat)), sparse = TRUE)

tf <- t(t(mat.binary) / Matrix::colSums(mat.binary)) # compute term-frequency
tf@x <- log1p(tf@x * scale.factor) # scale
idf <- log(1 + ncol(mat.binary) / Matrix::rowSums(mat.binary)) # inverse-document frequency smooth
tfidf <- .tfidf(tf, idf) # transform

# cast back to a matrix since things like UMAP don't like sparse matrices
tfidf <- as.matrix(tfidf)
return(t(tfidf))
}


#helper function
.binarizeMatrix <- function(obj) {
#set positive values to 1 and negative to 0
#open chromatin in atac or RNA is 1
#closed chromatin in bisulfite or arrays is 1
#just associate 1 with signal
obj[obj > 0] <- 1
obj[obj < 0] <- 0
return(obj)
# helper function
# set positive values to 1 and negative to 0
# open chromatin in atac or RNA is 1
# closed chromatin in bisulfite or arrays is 1
# just associate 1 with signal
.binarizeMatrix <- function(mat) {
mat[mat > 0] <- 1
mat[mat < 0] <- 0
return(mat)
}

#helper function for TF-IDF transform
#modeled after http://andrewjohnhill.com/images/posts/2019-5-6-dimensionality-reduction-for-scatac-data/analysis.html
# helper function for TF-IDF transform
# modeled after http://andrewjohnhill.com/images/posts/2019-5-6-dimensionality-reduction-for-scatac-data/analysis.html
.tfidf <- function(tf, idf) {
tf = t(tf)
tf <- t(tf)
tf@x <- tf@x * rep.int(idf, diff(tf@p))
tf = t(tf)
tf <- t(tf)
return(tf)
}

#' Transform/normalize compartment calls using TF-IDF on HDF5-backed objects
#'
#'
#' @name hdf5TFIDF
#'
#' @param h5 SummarizedExperiment object, DelayedMatrix, or a normal matrix
Expand All @@ -87,57 +82,55 @@ transformTFIDF <- function(obj, scale.factor = 1e5) {
#' @importFrom methods as is
#'
#' @examples
#'
#'
#' m <- 1000
#' n <- 100
#' mat <- round(matrix(runif(m*n), m, n))
#' #Input needs to be a tall matrix
#' mat <- round(matrix(runif(m * n), m, n))
#' # Input needs to be a tall matrix
#' tfidf <- hdf5TFIDF(mat)
#'
#'
#' @export
hdf5TFIDF <- function(h5, scale.factor = 1e5,
return.dense = FALSE,
return.se = FALSE) {
#binarze
# binarze
if (is(h5, "SummarizedExperiment")) {
if (!is(assay(h5), "DelayedMatrix")) {
#coerce to hdf5 backing to work with any SE
assay(h5) <- as(assay(h5), "HDF5Matrix")
assay(h5) <- as(assay(h5), "HDF5Matrix") # coerce to hdf5 backing to work with any SE
}
assay(h5)[assay(h5) > 0] <- 1
#tall matrix
h5.mat <- assay(h5)
h5.mat <- assay(h5) # tall matrix
}

if (is(h5, "DelayedMatrix")) {
#make the matrix tall if needed
# make the matrix tall if needed
if (dim(h5)[1] < dim(h5)[2]) h5 <- t(h5)
h5[h5 > 0] <- 1
h5.mat <- h5
}

if (is(h5, "matrix")) {
h5[h5 > 0] <- 1
h5.mat <- as(h5, "HDF5Matrix")
}
#term frequency

message("Computing term frequency.")
tf <- t(t(h5.mat)/DelayedMatrixStats::colSums2(h5.mat))
#scale
tf <- log1p(tf * scale.factor)
#inverse document frequency
tf <- t(t(h5.mat) / DelayedMatrixStats::colSums2(h5.mat)) # term frequency
tf <- log1p(tf * scale.factor) # scale

message("Computing inverse document frequency.")
idf <- log(1 + ncol(h5.mat)/DelayedMatrixStats::rowSums2(h5.mat))
#cast the tf matrix back to a sparse matrix
#TODO: fix this ugliness...
idf <- log(1 + ncol(h5.mat) / DelayedMatrixStats::rowSums2(h5.mat)) # inverse document frequency

# TODO: fix this ugliness...
tf.mat <- as.matrix(tf)
tf.sparse <- Matrix(tf.mat, sparse = TRUE)
#transpose for TF-IDF
tf.sparse <- t(tf.sparse)
#TF-IDF applied
tf.sparse <- Matrix(tf.mat, sparse = TRUE) # cast the tf matrix back to a sparse matrix
tf.sparse <- t(tf.sparse) # transpose for TF-IDF

message("TF-IDF")
tf.sparse@x <- tf.sparse@x * rep.int(idf, diff(tf.sparse@p))
#transpose again
tf.sparse <- t(tf.sparse)
#coerce back to dense matrix
tf.sparse@x <- tf.sparse@x * rep.int(idf, diff(tf.sparse@p)) # TF-IDF applied
tf.sparse <- t(tf.sparse) # transpose again

# coerce back to dense matrix
if (return.dense) {
message("WARNING: This might blow up!")
message("If you get a cholmod error: problem too large, set return.dense to FALSE.")
Expand Down
4 changes: 2 additions & 2 deletions man/hdf5TFIDF.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 11 additions & 5 deletions man/transformTFIDF.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.