Commit ae1d65da authored by Christoffer Flensburg's avatar Christoffer Flensburg

adding all the data and scripts.

parent a1d37477
File mode changed from 100644 to 100755
This diff is collapsed.
File added
#load in the ASCAT calls that we got from the ASCAT authors
#hg38 version is lifted over from hg19.
loadASCAT = function(genome='hg38') {
if ( genome == 'hg19' ) {
load('~/gdc/TCGA/ASCATcalls/cnasHg19.Rdata')
return(cnasHg19)
}
if ( genome == 'hg38' ) {
load('~/gdc/TCGA/ASCATcalls/cnasHg38.Rdata')
return(cnasHg38)
}
}
#mean-variance and neighbour correction plots
#plot the neighbour boost histogram
plotNeighbour = function(CR, geneCounts, doHouseKeeping=F, quant=0.9, ...) {
if ( doHouseKeeping ) {
meanExp = rowMeans(as.matrix(geneCounts[,-1]))
houseExpression = quantile(log2(1+meanExp), probs=c(0.65, 0.98)) #housekeeping = 65%-98% expression quantile
maxPenalty = 1
distance = pmax(superFreq:::noneg(log2(1+meanExp)-houseExpression[2]), superFreq:::noneg(houseExpression[1]-log2(1+meanExp)))
expressionPenalty = pmin(maxPenalty,2*distance/(houseExpression[2]-houseExpression[1]))
correctedWidth = CR$width + expressionPenalty
houseVariance = quantile(correctedWidth[expressionPenalty==0], probs=c(0.5, 0.95)) #penalty for top 50% most variable housekeeping genes.
maxPenalty = 1
distance = superFreq:::noneg((correctedWidth-houseVariance[1]))
variancePenalty = pmin(maxPenalty,maxPenalty*distance/(houseVariance[2]-houseVariance[1]))
correctedWidth = correctedWidth + variancePenalty
housekeeping = expressionPenalty + variancePenalty == 0
CR = CR[housekeeping,]
CR = CR[order(CR$x1),]
}
x = (-1000:1000)/100
y1 = rt(100000, CR$df[1])
y2 = rt(100000, CR$df[1])
the = quantile(abs(y1-y2), probs=quant)
last = nrow(CR)
changeBoost = function(boost) quantile(abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1]+boost)^2+(CR$width[-last]+boost)^2)), probs=quant)
testRange = (0:1000)/1000
changes = sapply(testRange, changeBoost)
widthBoost = testRange[max(c(1, which(changes > the)))]
maxDev = max(abs(y1-y2), abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1])^2+(CR$width[-last])^2)),
abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1]+widthBoost)^2+(CR$width[-last]+widthBoost)^2)))
breaks = (0:50)/50*maxDev
xmax = quantile(abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1])^2+(CR$width[-last])^2)), probs=0.99)*1.2
h0 = hist(abs(y1-y2), plot=F, breaks=breaks)
h1 = hist(abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1])^2+(CR$width[-last])^2)), plot=F, breaks=breaks)
h2 = hist(abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1]+widthBoost)^2+(CR$width[-last]+widthBoost)^2)), plot=F, breaks=breaks)
xmax=10
plot(1, type='n', xlim=c(0, xmax), ylim=c(0.0001, 1), log='y', ...)
polygon(c(min(h0$mids),h0$mids), c(0,h0$density)+0.0001, col='grey')
lines(h1$mids, h1$density+0.0001, col=mcri('blue'), lwd=5)
lines(h2$mids, h2$density+0.0001, col=mcri('red'), lwd=3)
legend('topright', c('expected', 'no boost', 'boosted variance'), lwd=c(10, 5, 3),
col=mcri(c('grey', 'blue', 'red')))
}
#compares fraction of genome that agrees within the LFC and BAF limits between the two sets of CNA calls.
fractionAgreeingBoth = function(cnaCalls1, cnaCalls2, LFClimit=0.1, BAFlimit=0.05, includeSex=F) {
agreement = sapply(intersect(names(cnaCalls1), names(cnaCalls2)), function(run) {
cna1 = cnaCalls1[[run]]
if ( class(cna1) != 'data.frame' ) return(NA)
cna2 = cnaCalls2[[run]]
if ( class(cna2) != 'data.frame' ) return(NA)
if ( !includeSex ) {
cna1 = cna1[!(xToChr(cna1$x1, 'hg38') %in% c('X', 'Y')),]
cna2 = cna2[!(xToChr(cna2$x1, 'hg38') %in% c('X', 'Y')),]
}
i1 = 1
i2 = 1
totalOverlap = 0
overlapWithinError = 0
while ( i1 <= nrow(cna1) & i2 <= nrow(cna2) ) {
x11 = cna1$x1[i1]
x12 = cna1$x2[i1]
x21 = cna2$x1[i2]
x22 = cna2$x2[i2]
overlap = superFreq:::noneg(min(x22, x12) - max(x11, x21))
if ( overlap > 0 ) {
totalOverlap = totalOverlap + overlap
if ( abs(cna1$M[i1]-cna2$M[i2]) < LFClimit &
(abs(cna1$f[i1]-cna2$f[i2]) < BAFlimit | is.na(cna1$f[i1]-cna2$f[i2]) |
(cna1$call[i1]=='AB' & cna2$postHet[i2] > 0.05)) )
overlapWithinError = overlapWithinError + overlap
}
if ( x12 <= x22 ) i1 = i1+1
else i2 = i2+1
}
return(overlapWithinError/totalOverlap)
})
return(agreement)
}
#a hacked version of vioplot that doesnt draw a new frame for every call with add=T.
vioplotMod =
function (x, ..., range = 1.5, h = NULL, ylim = NULL, names = NULL,
horizontal = FALSE, col = "magenta", border = "black", lty = 1,
lwd = 1, rectCol = "black", colMed = "white", pchMed = 19,
at, add = FALSE, wex = 1, drawRect = TRUE, force.est.xlim=NULL)
{
datas <- list(x, ...)
n <- length(datas)
if (missing(at))
at <- 1:n
upper <- vector(mode = "numeric", length = n)
lower <- vector(mode = "numeric", length = n)
q1 <- vector(mode = "numeric", length = n)
q3 <- vector(mode = "numeric", length = n)
med <- vector(mode = "numeric", length = n)
base <- vector(mode = "list", length = n)
height <- vector(mode = "list", length = n)
baserange <- c(Inf, -Inf)
args <- list(display = "none")
if (!(is.null(h)))
args <- c(args, h = h)
for (i in 1:n) {
data <- datas[[i]]
data.min <- min(data)
data.max <- max(data)
q1[i] <- quantile(data, 0.25, na.rm=T)
q3[i] <- quantile(data, 0.75, na.rm=T)
med[i] <- median(data, na.rm=T)
iqd <- q3[i] - q1[i]
upper[i] <- min(q3[i] + range * iqd, data.max)
lower[i] <- max(q1[i] - range * iqd, data.min)
est.xlim <- c(min(lower[i], data.min), max(upper[i],
data.max))
if ( !is.null(force.est.xlim) ) est.xlim = force.est.xlim
smout <- do.call("sm.density", c(list(data, xlim = est.xlim),
args))
hscale <- 0.4/max(smout$estimate) * wex
base[[i]] <- smout$eval.points
height[[i]] <- smout$estimate * hscale
t <- range(base[[i]])
baserange[1] <- min(baserange[1], t[1])
baserange[2] <- max(baserange[2], t[2])
}
if (!add) {
xlim <- if (n == 1)
at + c(-0.5, 0.5)
else range(at) + min(diff(at))/2 * c(-1, 1)
if (is.null(ylim)) {
ylim <- baserange
}
}
if (is.null(names)) {
label <- 1:n
}
else {
label <- names
}
boxwidth <- 0.05 * wex
if (!add)
plot.new()
if (!horizontal) {
if (!add) {
plot.window(xlim = xlim, ylim = ylim)
axis(2)
axis(1, at = at, label = label)
}
for (i in 1:n) {
polygon(c(at[i] - height[[i]], rev(at[i] + height[[i]])),
c(base[[i]], rev(base[[i]])), col = col, border = border,
lty = lty, lwd = lwd)
if (drawRect) {
lines(at[c(i, i)], c(lower[i], upper[i]), lwd = lwd,
lty = lty)
rect(at[i] - boxwidth/2, q1[i], at[i] + boxwidth/2,
q3[i], col = rectCol)
points(at[i], med[i], pch = pchMed, col = colMed)
}
}
}
else {
if (!add) {
plot.window(xlim = ylim, ylim = xlim)
axis(1)
axis(2, at = at, label = label)
}
for (i in 1:n) {
polygon(c(base[[i]], rev(base[[i]])), c(at[i] - height[[i]],
rev(at[i] + height[[i]])), col = col, border = border,
lty = lty, lwd = lwd)
if (drawRect) {
lines(c(lower[i], upper[i]), at[c(i, i)], lwd = lwd,
lty = lty)
rect(q1[i], at[i] - boxwidth/2, q3[i], at[i] +
boxwidth/2, col = rectCol)
points(med[i], at[i], pch = pchMed, col = colMed)
}
}
}
invisible(list(upper = upper, lower = lower, median = med,
q1 = q1, q3 = q3))
}
#recall as function of size
getRecall = function(recallList, breaks=30) {
sizeRecall = do.call(cbind, recallList)
sizeBreaks = c(0,round(exp((0:breaks)/breaks*log(max(sizeRecall+1))))) + 0.5
sizeBreaks = sizeBreaks[!duplicated(sizeBreaks)]
mids = hist(1, breaks=sizeBreaks, plot=F)$mids
recall = sapply(1:4, function(i) {
gain = sizeRecall[i,] > 0
gainRecalled = sizeRecall[i+4,] > 0.5*sizeRecall[i,]
gainAllH = hist(sizeRecall[i,gain], breaks=sizeBreaks, plot=F)
gainRecallH = hist(sizeRecall[i,gainRecalled], breaks=sizeBreaks, plot=F)
return(gainRecallH$counts/gainAllH$counts)
})
counts = sapply(1:4, function(i) {
gain = sizeRecall[i,] > 0
gainRecalled = sizeRecall[i+4,] > 0.5*sizeRecall[i,]
gainAllH = hist(sizeRecall[i,gain], breaks=sizeBreaks, plot=F)
gainRecallH = hist(sizeRecall[i,gainRecalled], breaks=sizeBreaks, plot=F)
return(gainAllH$counts)
})
return(list(recall=recall, counts=counts, mids=mids))
}
#cnaCalls2 recall of the class in cnaCalls1, as function of number of genes covered.
fractionCallsRecalled = function(cnaCalls1, cnaCalls2, crData2, includeSex=F, cpus=5, minPurity=0, maxPurity=Inf) {
cat('Running recall by size')
recall = mclapply(intersect(names(cnaCalls1), names(cnaCalls2)), function(run) {
cat('.')
cna1 = cnaCalls1[[run]]
if ( class(cna1) != 'data.frame' ) return(NA)
cna2 = cnaCalls2[[run]]
if ( class(cna2) != 'data.frame' ) return(NA)
cr2 = crData2[[run]]
if ( class(cr2) != 'data.frame' ) return(NA)
if ( !includeSex ) {
cna1 = cna1[!(xToChr(cna1$x1, 'hg38') %in% c('X', 'Y')),]
cna2 = cna2[!(xToChr(cna2$x1, 'hg38') %in% c('X', 'Y')),]
cr2 = cr2[!(xToChr(cr2$x1, 'hg38') %in% c('X', 'Y')),]
}
#as we only classify by gain, merge neoughbouring regions with different levels of gain.
N = nrow(cna1)
toMerge = which(cna1$chromosome[1:(N-1)] == cna1$chromosome[2:N] & nchar(cna1$call[1:(N-1)]) > 2 & nchar(cna1$call[2:N]) > 2)
while( length(toMerge) > 0 ) {
merge = toMerge[1]
cna1$x2[merge] = cna1$x2[merge+1]
cna1$end[merge] = cna1$end[merge+1]
cna1 = cna1[-merge,]
toMerge = which(cna1$chromosome[1:(N-1)] == cna1$chromosome[2:N] & nchar(cna1$call[1:(N-1)]) > 2 & nchar(cna1$call[2:N]) > 2)
}
results = sapply(1:nrow(cna1), function(chr) {
cna1Chr = cna1[chr,]
cna2Chr = cna2[cna2$x1 < cna1Chr$x2 & cna2$x2 > cna1Chr$x1,]
cr2Chr = cr2[cr2$x1 < cna1Chr$x2 & cr2$x2 > cna1Chr$x1,]
if ( nrow(cr2Chr) == 0 )
return(c(gain1true=0, wt1true=0, loss1true=0, loh1true=0,
gainRecall=0, wtRecall=0, lossRecall=0, lohRecall=0))
call1 = cna1Chr$call
call2 = cna2Chr$call
crx = (cr2Chr$x1+cr2Chr$x2)/2
nGenes1 = rowSums(matrix(sapply(crx, function(x) x > cna1Chr$x1) &
sapply(crx, function(x) x < cna1Chr$x2), nrow=nrow(cna1Chr)))
overlap = sapply(1:nrow(cna1Chr), function(r1) {
sapply(1:nrow(cna2Chr), function(r2) {
sum(crx > max(cna1Chr$x1[r1], cna2Chr$x1[r2]) & crx < min(cna1Chr$x2[r1], cna2Chr$x2[r2]))
})
})
overlap = matrix(overlap, nrow=nrow(cna2Chr), ncol=nrow(cna1Chr))
nA1 = nchar(call1) - nchar(gsub('A', '', call1))
nA2 = nchar(call2) - nchar(gsub('A', '', call2))
nB1 = nchar(call1) - nchar(gsub('B', '', call1))
nB2 = nchar(call2) - nchar(gsub('B', '', call2))
n1 = nA1 + nB1
n2 = nA2 + nB2
gain1 = n1>2
wt1 = nA1==1 & nB1==1
loss1 = n1<2
loh1 = n1==2 & nA1*nB1==0
gain2 = n2>2
wt2 = nA2==1 & nB2==1
loss2 = n2<2
loh2 = n2==2 & nA2*nB2==0
gain1true = sum(nGenes1*(n1>2))
wt1true = sum(nGenes1*(nA1==1 & nB1==1))
loss1true = sum(nGenes1*(n1<2))
loh1true = sum(nGenes1*(n1==2 & nA1*nB1==0))
gainRecall = sum(t(overlap*gain2)*gain1)
wtRecall = sum(t(overlap*wt2)*wt1)
lossRecall = sum(t(overlap*loss2)*loss1)
lohRecall = sum(t(overlap*loh2)*loh1)
return(c(gain1true=gain1true, wt1true=wt1true, loss1true=loss1true, loh1true=loh1true,
gainRecall=gainRecall, wtRecall=wtRecall, lossRecall=lossRecall, lohRecall=lohRecall))
})
return(results)
}, mc.cores=cpus)
cat('done.\n')
return(recall)
}
#plots recall as function of size
plotRecall = function(recall, counts, mids, add=F, al=1, ...) {
if ( !add ) plot(0, type='n', xlim=c(0.5, max(mids)), log='x', ylim=c(0,1), frame=F, xlab='', ylab='', xaxt='n', ...)
title(xlab="# genes in segment", ylab='recall', line=2, cex.lab=1)
axis(1, at=c(1,10,100,1000))
lines(mids, recall[,1], col=mcri('red', al=al), lwd=.5)
points(mids, recall[,1], col=mcri('red', al=al), cex=pmin(1.5,sqrt(counts[,1]/30)), pch=16)
lines(mids, recall[,3], col=mcri('blue', al=al), lwd=.5)
points(mids, recall[,3], col=mcri('blue', al=al), cex=pmin(1.5,sqrt(counts[,3]/30)), pch=16)
lines(mids, recall[,4], col=mcri('green', al=al), lwd=.5)
points(mids, recall[,4], col=mcri('green', al=al), cex=pmin(1.5,sqrt(counts[,4]/30)), pch=16)
if ( !add ) legend('topleft', c('gain', 'loss', 'LOH'), pch=16, lwd=1, col=mcri(c('red', 'blue', 'green')))
}
#plot a heatmap of the CNAs
plotCNAheatmap = function(clusters, genome='hg19', separator=F, includeSexChr=T, printSampleNames=T, chr.cex=1, chr.stagger=0, chr.staggerFrom=1, ...) {
N = length(clusters)
cLs = chrLengths(genome)
if ( !includeSexChr ) cLs = cLs[names(cLs) != c('X', 'Y')]
plot(1, type='n', xlim=c(-sum(cLs)*0.02, sum(cLs)), ylim=c(0., N*1.1), xlab='', ylab='', xaxt='n', yaxt='n', frame.plot=F, ...)
superFreq:::addChromosomeLines(ylim=c(0.5, N*1.05), col=mcri('grey'), genome=genome,
onlyNumbers=!includeSexChr, cex=chr.cex, stagger=chr.stagger,
staggerFrom=chr.staggerFrom)
if ( separator ) segments(rep(0,N+1), 0.5+0:N, rep(sum(cLs),N), 0.5+0:N, col=mcri('grey'), lwd=0.5)
xmax = sum(cLs)
ymin = 0.5
ymax = N + 0.5
segments(c(0,0,0,xmax), c(ymin,ymin,ymax,ymin), c(0,xmax,xmax,xmax), c(ymax,ymin,ymax,ymax), lwd=1)
for ( ind in names(clusters) ) {
i = which(names(clusters) == ind)
cluster = clusters[[ind]]
if ( !includeSexChr ) cluster = cluster[!(xToChr(cluster$x1, genome) %in% c('X', 'Y')),]
isCNN = cluster$call == 'AA'
isAB = cluster$call == 'AB'
red = ifelse(isCNN | isAB, 0, superFreq:::noneg(pmin(cluster$M,1)))
blue = ifelse(isCNN | isAB, 0, superFreq:::noneg(pmin(-cluster$M,1)))
green = ifelse(isCNN, cluster$clonality, 0)
col = ifelse(red > 0, mcri('red', red), ifelse(blue > 0, mcri('blue', blue), mcri('green', green)))
width = ifelse(red+green+blue > 0.07, 1, 0)
x1 = cluster$x1
x2 = cluster$x2
rect(x1, i-width/2, x2, i+width/2, col=col, border=F)
if ( printSampleNames ) text(-1.5e7, i, gsub('\\.S[0-9]+$', '', ind), adj=1, cex=1)
}
}
#plot CNAs across a smaller region over a gene of interest.
plotFocalHeatmap = function(clusters, range, printSampleNames=T, padding=500e3, ...) {
N = length(clusters)
rangeOI = range
range = c(range[1]-padding, range[2]+padding)
focalClusters = lapply(clusters, function(cluster) cluster[cluster$x2 > range[1] & cluster$x1 < range[2],])
xShift = mean(range)
plot(1, type='n', xlim=(c(range[1], 1.1*range[2] - 0.1*range[1])-xShift)/1e3, ylim=c(0., N*1.1), xlab='', ylab='', yaxt='n', frame.plot=F, ...)
title(xlab="distance (kbp)", ylab='', line=2, cex.lab=1)
for ( ind in names(focalClusters) ) {
i = which(names(focalClusters) == ind)
cluster = focalClusters[[ind]]
isCNN = cluster$call == 'AA'
isAB = cluster$call == 'AB'
red = ifelse(isCNN | isAB, 0, superFreq:::noneg(cluster$M))
blue = ifelse(isCNN | isAB, 0, 0)
green = ifelse(isCNN, 0, 0)
col = ifelse(red > 0, mcri('red', pmin(1,red)), ifelse(blue > 0, mcri('blue', pmin(1,blue)), mcri('green', green)))
width = 1
x1 = (cluster$x1-xShift)/1e3
x2 = (cluster$x2-xShift)/1e3
rect(x1, i-width/2, x2, i+width/2, col=col, border=F)
if ( printSampleNames ) text(-1.5e7, i, gsub('\\.S[0-9]+$', '', ind), adj=1, cex=1)
}
segments((rangeOI-xShift)/1e3, 0, (rangeOI-xShift)/1e3, N+1, lwd=1, col=mcri('blue'))
}
#helper functions calculating the size and magnitude of amplifications over a range.
ampSizes = function(clusters, range, cr) {
xmid = mean(range)
focalClusters = lapply(clusters, function(cluster) cluster[cluster$x1 < xmid & cluster$x2 > xmid,])
ret = sapply(focalClusters, function(cluster) {
if ( nrow(cluster) != 1 ) return(NA)
if ( nchar(cluster$call) > 3 & cluster$x2 - cluster$x1 < 20e6 ) return(sum(cluster$x1 <= cr$x1 & cluster$x2 >= cr$x2))
return(NA)
})
return(ret)
}
ampMagnitude = function(clusters, range, cr) {
xmid = mean(range)
focalClusters = lapply(clusters, function(cluster) cluster[cluster$x1 < xmid & cluster$x2 > xmid,])
ret = sapply(focalClusters, function(cluster) {
if ( nrow(cluster) != 1 ) return(NA)
if ( nchar(cluster$call) > 3 & cluster$x2 - cluster$x1 < 20e6 ) return(nchar(cluster$call))
return(NA)
})
return(ret)
}
#returns a matrix of difference in LFC across the genome for each sample.
differenceCall = function(cnaCalls1, cnaCalls2, includeSex=F) {
difference = lapply(intersect(names(cnaCalls1), names(cnaCalls2)), function(run) {
cna1 = cnaCalls1[[run]]
if ( class(cna1) != 'data.frame' ) return(NA)
cna2 = cnaCalls2[[run]]
if ( class(cna2) != 'data.frame' ) return(NA)
if ( !includeSex ) {
cna1 = cna1[!(xToChr(cna1$x1, 'hg38') %in% c('X', 'Y')),]
cna2 = cna2[!(xToChr(cna2$x1, 'hg38') %in% c('X', 'Y')),]
}
#use 1Mbp bins
agreement = rep(0, sum(chrLengths('hg38'))/1e6)
disagreement = rep(0, sum(chrLengths('hg38'))/1e6)
i1 = 1
i2 = 1
totalOverlap = 0
overlapWithinError = 0
while ( i1 <= nrow(cna1) & i2 <= nrow(cna2) ) {
x11 = cna1$x1[i1]
x12 = cna1$x2[i1]
x21 = cna2$x1[i2]
x22 = cna2$x2[i2]
overlap = superFreq:::noneg(min(x22, x12) - max(x11, x21))
if ( overlap > 0 ) {
totalOverlap = totalOverlap + overlap
call1 = cna1$call[i1]
call2 = cna2$call[i2]
nA1 = nchar(call1) - nchar(gsub('A', '', call1))
nA2 = nchar(call2) - nchar(gsub('A', '', call2))
nB1 = nchar(call1) - nchar(gsub('B', '', call1))
nB2 = nchar(call2) - nchar(gsub('B', '', call2))
n1 = nA1 + nB1
n2 = nA2 + nB2
bins = ceiling(max(x11,x21)/1e6):ceiling(min(x22,x12)/1e6)
if ((n1 > 2 & n2 > 2) | #both gain
(nA1==1 & nA2==1 & nB1==1 & nB2==1 ) | #both normal
(n1 < 2 & n2 < 2) | #both loss
(n1==2 & n2==2 & nA1*nB1==0 & nA2*nB2==0) #both CNN LOH
)
agreement[bins] = 1
else
disagreement[bins] = 1
}
if ( x12 <= x22 ) i1 = i1+1
else i2 = i2+1
}
return(list(agreement=agreement, disagreement=disagreement))
})
agreement = sapply(difference, function(diff) diff$agreement)
disagreement = sapply(difference, function(diff) diff$disagreement)
return(list(agreement=agreement, disagreement=disagreement))
}
#smoothing function
multiBlur = function(x, range, repeats) {
if ( repeats == 0 ) return(x)
for ( i in 1:repeats )
x = blurN(x, range)
return(x)
}
blurN = function(vec, M) {
N = length(vec)
vec = c(rev(vec), vec, rev(vec))
ret = 1:N
for(i in 1:N) {
ret[i] = sum(vec[N + (i-M):(i+M)])/(2*M+1)
}
return(ret)
}
smoothLine = function(x, y, range=5, repeats=15, ...) {