Introduction

Sometimes it is needed to visualize a matrix to see sparsity or compare a different kind of ordering. In this post, I will show you a solution with geom_raster() in ggplot2. It is similar that Matlab spy() function. geom_raster() is a special case of geom_tile() where tiles are the same size.

Create a matrix

This chunk will create a 10×6 weighted matrix of x1 to x10 and y1 to y6.

A <- matrix(c(2 ,   0   ,   0   ,   0   ,   0   ,   2   ,
              0 ,   0   ,   0   ,   0   ,   2   ,   2   ,
              0 ,   0   ,   0   ,   0   ,   3   ,   2   ,
              0 ,   0   ,   0   ,   0   ,   0   ,   6   ,
              2 ,   2   ,   3   ,   5   ,   4   ,   2   ,
              6 ,   4   ,   3   ,   0   ,   0   ,   0   ,
              0 ,   0   ,   3   ,   1   ,   0   ,   0   ,
              0 ,   0   ,   0   ,   2   ,   2   ,   1   ,
              0 ,   0   ,   2   ,   3   ,   0   ,   0   ,
              0 ,   2   ,   2   ,   0   ,   0   ,   0), nrow=10, ncol=6, byrow = T)
rownames(A) <- paste("x", 1:10, sep="")
colnames(A) <- paste("y", 1:6, sep="")

Ordering rows and columns

Simple ordering by colsums and rowsums

ggplot2 works with long data instead of wide data format like a matrix, so you need to convert it first.

library(reshape2)
longData <- melt(A)

If you want to control the orderings of rows and columns you need to specify the ordering with factors of rows and columns. Levels are vectors of rownames and colnames of the original matrix.

library(dplyr)
longData$Var1 <- factor(longData$Var1, levels=names(sort(rowSums(A), decreasing = T)))
longData$Var2 <- factor(longData$Var2, levels=names(sort(colSums(A), decreasing = T)))

longData<-longData[longData$value!=0,]
library(ggplot2)
ggplot(longData, aes(x = Var2, y = Var1)) + 
  geom_raster(aes(fill=value)) + 
  scale_fill_gradient(low="grey60", high="red") +
  labs(x="", y="") +
  theme_bw()

plot of chunk raster1

Ordering with TSP

There is an excellent implementation of traveling sales person (TSP) ordering in seriation package. It randomly selects the first row or column in process of ordering, therefore every result of sorting is different.

library(seriation)
o <- seriate(A, method="BEA_TSP")
longData$Var1 <- factor(longData$Var1, levels=names(unlist(o[[1]][])))
longData$Var2 <- factor(longData$Var2, levels=names(unlist(o[[2]][])))
longData<-longData[longData$value!=0,]
ggplot(longData, aes(x = Var2, y = Var1)) + 
  geom_raster(aes(fill=value)) + 
  scale_fill_gradient(low="grey60", high="red") +
  labs(x="", y="") +
  theme_bw()

plot of chunk raster_TSP

Ordering by clusters

In this example, I use hierarchical clustering where distance matrix is calculated with generalized Jaccard similarity for row vectors and column vectors.

A <- A
sim.jac <- matrix(0, nrow=nrow(A), ncol=nrow(A))
rownames(sim.jac) <- rownames(A)
colnames(sim.jac) <- rownames(A)

#weighted jaccard for rows
pairs <- t(combn(1:nrow(A), 2))

for (i in 1:nrow(pairs)){
  num <- sum(sapply(1:ncol(A), function(x)(min(A[pairs[i,1],x],A[pairs[i,2],x]))))
  den <- sum(sapply(1:ncol(A), function(x)(max(A[pairs[i,1],x],A[pairs[i,2],x]))))
  sim.jac[pairs[i,1],pairs[i,2]] <- num/den
  sim.jac[pairs[i,2],pairs[i,1]] <- num/den  
}
sim.jac[which(is.na(sim.jac))] <- 0
diag(sim.jac) <- 1
sim.jac.rows <- sim.jac
A <- t(A)
sim.jac <- matrix(0, nrow=nrow(A), ncol=nrow(A))
rownames(sim.jac) <- rownames(A)
colnames(sim.jac) <- rownames(A)

#weighted jaccard for rows
pairs <- t(combn(1:nrow(A), 2))

for (i in 1:nrow(pairs)){
  num <- sum(sapply(1:ncol(A), function(x)(min(A[pairs[i,1],x],A[pairs[i,2],x]))))
  den <- sum(sapply(1:ncol(A), function(x)(max(A[pairs[i,1],x],A[pairs[i,2],x]))))
  sim.jac[pairs[i,1],pairs[i,2]] <- num/den
  sim.jac[pairs[i,2],pairs[i,1]] <- num/den  
}
sim.jac[which(is.na(sim.jac))] <- 0
diag(sim.jac) <- 1
sim.jac.cols <- sim.jac
Q <- A
longData<-melt(Q)
longData<-longData[longData$value!=0,]
longData <- left_join(longData, cut.rows, by=c("Var2"="names"))
colnames(longData)[4] <- "Var2_clust"
longData$Var2 <- as.factor(longData$Var2)
longData <- left_join(longData, cut.cols, by=c("Var1"="names"))
colnames(longData)[5] <- "Var1_clust"

longData$Var1 <- factor(longData$Var1, levels=unique(arrange(longData, Var1_clust)[,1]))
longData$Var2 <- factor(longData$Var2, levels=unique(arrange(longData, Var2_clust)[,2]))

#3 cluster
library(ggsci)
library(scales)
mypal <- pal_simpsons("springfield", alpha = 1)(5)[c(1,2,5)]
show_col(mypal)

plot of chunk preparation_of_ggplot

axis.x.colour % select(Var1, Var1_clust) %>% unique %>% arrange(Var1_clust) %>% select(Var1_clust))[,1] %>% plyr::mapvalues(from=c(1:3), to=mypal)

axis.y.colour % select(Var2, Var2_clust) %>% unique %>% arrange(Var2_clust) %>% select(Var2_clust))[,1] %>% plyr::mapvalues(from=c(1:3), to=mypal)
ggplot(longData, aes(x = Var1, y = Var2)) + 
  geom_raster(aes(fill=value)) + 
  scale_fill_gradient(low="grey60", high="red") +
  labs(x="", y="") +
  theme_bw() + theme(axis.text.x=element_text(size=11, face="bold"),
                     axis.text.y=element_text(size=11, face="bold"),
                     legend.text=element_text(size=8),
                     legend.title=element_text(size=8)) +
  theme(axis.text.y=element_text(colour=axis.y.colour),
        axis.text.x=element_text(colour=axis.x.colour)) +
  annotate("segment", x = 3.5, xend = 3.5, y = 0, yend = 10.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 0, xend = 6.5, y = 2.5, yend = 2.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 0, xend = 6.5, y = 5.5, yend = 5.5, colour = "blue", size = 0.5)

plot of chunk raster_hclust


Be happyR! 🙂

Advertisements