## Collects probe index information and returns a data table
## in the same order as the AffyBatch intensity matrix
getProbeInfo.index <- function(affyData) {
  probeset.id <- probeNames(affyData)

  ## For each probeset, probes are indexed in reverse order of occurrence in the intensity matrix
  probe.index <- ave(seq(length(probeset.id), 1, -1), probeset.id, FUN=rank)

  return(data.frame(probeset.id = probeset.id, x = probe.index, stringsAsFactors=F))
}

## Helper function to return the appropriate probe info
getProbeInfo <- function(affyData, location.type = "index", location.file.dir = NULL) {
  if (location.type == "index") {
    return(getProbeInfo.index(affyData))
  } else {
    return(getProbeInfo.absolute(affyData, location.file.dir))
  }
}

## Collects absolute probe location information
## and returns a data table in the same order as the AffyBatch intensity matrix
getProbeInfo.absolute <- function(affyData, location.file.dir = NULL) {
  location.file = file.path(location.file.dir,
                            paste(cdfName(affyData), ".probe_distances.Rd", sep=""))

  tryCatch({
    suppressWarnings(load(location.file))
  }, error = function(err) {
    stop(paste("Cannot open probe location file. Make sure that directory given by",
               " location.file.dir (\"", location.file.dir, "\") contains the respective file",
               " for the current chip type (", cdfName(affyData), "). Probe location files",
               " for all Affymetrix expression arrays can be found at",
               " http://www.izbi.uni-leipzig.de/downloads_links/programs/rna_integrity.php",
               sep=""), call.=FALSE)
  })

  ## Add affy indices as rownames to that table (xy2indices)
  options(scipen = 99)  # This is important: otherwise indices like 400000 are
                        # transformed into "4e5", leading to bugs
  rownames(probeDists) <- apply(probeDists[c("Probe.X","Probe.Y")], 1,
                                function(x) xy2indices(x[1], x[2], abatch=affyData))

  ## Create table in same order as pm() containing probeset IDs, distance
  ## ... mapping the probe distances using the affy probe numbers (x*width + y)
  ## @note Control probes must be handled carefully: insert pseudo-values
  probeInfo <- data.frame(pmI = pm(affyData[,1]), probeset.id = "control", x = NA,
                          stringsAsFactors=F)
  probeInfo[,1] <- NULL  # drop the intensity column - it was only used as a template
  probeInfo[rownames(probeDists), c("probeset.id","x")] <-
    probeDists[,c("Probe.Set.Name","Probe.Distance")]

  return(probeInfo)
}

DecayFunction <- function(x, d.inf, lambda.inv, shiftY) {
  (1 - d.inf)*exp(-lambda.inv*x) + d.inf + shiftY
}

TryToFitDecayFunction <- function(x, means, defaultD1, defaultD2, defaultShiftY = 0) {
  params <- list(d1 = defaultD1, d2 = defaultD2, shiftY = defaultShiftY)

  p = try({
    p2 = try({
      fit = nls(means ~ DecayFunction(x, d1, d2, shiftY),
                start = list(d1 = defaultD1, d2 = defaultD2, shiftY = defaultShiftY),
                lower = c(0, 0.01, -0.2), upper = c(1.1, 5, 1),
                algorithm="port")}, silent=T)  # use lower and upper bounds for robustness

    ## If the fit does not converge, assume d1=0
    if(inherits(p2, "try-error")) {
      cat("Warning: Convergence error. Setting d1=0.\n")
      fit <- nls(means ~ DecayFunction(x, 0, d2, shiftY),
                 start = list(d2 = defaultD2, shiftY = defaultShiftY),
                 lower = c(0.005, -0.2), upper = c(2, 0.3),
                 algorithm="port")
      params$d1 <- 0
    } else {
      params$d1 <- coef(fit)["d1"]
    }
    params$shiftY <- coef(fit)["shiftY"]
    params$d2 <- coef(fit)["d2"]
  }, silent=T)

  if(inherits(p, "try-error")) {
    cat("Convergence error. Using default degradation parameters.\n")
  }

  return(params)
}
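
## Example (sketch, not run): fitting the decay model to synthetic per-position means.
## All demo values below are assumptions for illustration only; the fitted d1/d2 recover
## the d.inf and lambda.inv used to generate the data.
##   x.demo     <- 0:10                                  # probe position, shifted to start at 0
##   means.demo <- DecayFunction(x.demo, d.inf = 0.3, lambda.inv = 0.25, shiftY = 0)
##   fit.demo   <- TryToFitDecayFunction(x.demo, means.demo,
##                                       defaultD1 = means.demo[length(means.demo)],
##                                       defaultD2 = 0.6)
##   str(fit.demo)  # list(d1, d2, shiftY); here d1 ~ 0.3, d2 ~ 0.25, shiftY ~ 0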
## Creates tongs table
ComputeTongs <- function(x, intensity, sigma, probeset.id,
                         min.x = 1, max.x = 11, mvg.avg.size = 450) {
  ## Compute boundaries (x) of upper and lower tongs (= upper and lower third of x)
  tongs.probes.per.probeset <- floor((max.x - min.x) / 3)
  p3.probes.max.x <- min.x + tongs.probes.per.probeset - 1  # 3' probes
  p5.probes.max.x <- max.x - tongs.probes.per.probeset + 1  # 5' probes

  ## Remove all probes outside the min.x and max.x boundaries
  subset.x <- x >= min.x & x <= max.x
  x <- x[subset.x]
  intensity <- intensity[subset.x]
  sigma <- sigma[subset.x]
  probeset.id <- probeset.id[subset.x]

  ## First, select probesets for which upper and lower tongs can be computed, i.e. at least
  ## one probe is available within both the upper and the lower third of x
  min.x.ps <- tapply(x, probeset.id, min)
  max.x.ps <- tapply(x, probeset.id, max)
  tongs.is.possible <- min.x.ps <= p3.probes.max.x & max.x.ps >= p5.probes.max.x
  tongs.is.possible.ps <- rownames(tongs.is.possible)[tongs.is.possible == T]

  subset.x <- probeset.id %in% tongs.is.possible.ps
  x <- x[subset.x]
  intensity <- intensity[subset.x]
  sigma <- sigma[subset.x]
  probeset.id <- probeset.id[subset.x]

  intensity <- log10(intensity)  # use log 10

  ## Compute probeset-based averages
  sigma.i <- tapply(sigma, probeset.id, ProbesetAverage)  # the mean of identical values is the value
                                                          # itself (in case sigma is already a
                                                          # probeset average)
  average.i <- tapply(intensity, probeset.id, ProbesetAverage)

  is.p3.x <- x <= p3.probes.max.x
  p3.ps.i <- tapply(intensity[is.p3.x], probeset.id[is.p3.x], ProbesetAverage)

  is.p5.x <- x >= p5.probes.max.x
  p5.ps.i <- tapply(intensity[is.p5.x], probeset.id[is.p5.x], ProbesetAverage)

  mean.x <- tapply(x, probeset.id, mean)

  tongs <- data.frame(sigma = sigma.i, mean.x = mean.x, p3.i = p3.ps.i, p5.i = p5.ps.i,
                      average.i = average.i, row.names = rownames(mean.x))

  ## Apply moving average values
  tongs <- tongs[order(tongs$sigma),]
  tongs[,"p5.tongs"] <- filter(tongs[,"p5.i"] - tongs[,"average.i"], rep(1/mvg.avg.size, mvg.avg.size))
  tongs[,"p3.tongs"] <- filter(tongs[,"p3.i"] - tongs[,"average.i"], rep(1/mvg.avg.size, mvg.avg.size))
  tongs <- na.omit(tongs)  # remove NA rows created by filter

  tongs[,"hook"] <- tongs[,"p3.tongs"] - tongs[,"p5.tongs"]

  return(tongs)
}

EstimateHookParams <- function(tongs) {
  tongs.loess <- loess(hook ~ sigma, tongs, span=2/3)

  hook.params <- list()
  hook.params$max.raw <- max(tongs$hook)       # tongsParamRaw
  hook.params$max.fit <- max(tongs.loess$fit)  # tongsParamFit

  tongs$hook <- tongs.loess$fit / max(tongs.loess$fit)
  tongs$hook <- pmax(tongs$hook, 0)  # forbid negatives

  ## Estimate the non-specific threshold: consider the 30% smallest EMs as non-specific
  hook.params$ns.threshold <- quantile(tongs$sigma, 0.30)

  hook.params$hook.max.sigma <- tongs[min(which(tongs$hook == max(tongs$hook))),"sigma"]

  ## Alternate estimate: maximum sigma where the (smoothed) hook is smaller than 0.02
  hook.params$ns.threshold2 <- max(tongs$sigma[ tongs.loess$fit < 0.02 &
                                                tongs$sigma < hook.params$hook.max.sigma])

  return(hook.params)
}

# Currently only gives tongs for the index-based variant, x = k
GetTongs <- function(affyData, chip.idx = 1) {
  probe.info <- getProbeInfo(affyData, "index")

  intensity <- as.vector(pm(affyData[,chip.idx]))
  sigma <- ave(intensity, list(probe.info$probeset.id), FUN=ProbesetLogAverage)

  ## Create tongs and estimate parameters
  tongs <- ComputeTongs(probe.info$x, intensity, sigma, probe.info$probeset.id, 1, 11)

  return(tongs)
}

# Plotting helper
PlotTongs <- function(tongs) {
  ymax <- 0.35
  ymin <- -0.35

  plot(tongs[,"sigma"], tongs[,"p5.tongs"], type="l", ylim = c(ymin, ymax),
       xlab=expression(Sigma), ylab=expression(Sigma[subset] - Sigma),
       main=paste("Tongs plot"), panel.first = grid())
  lines(tongs[,"sigma"], tongs[,"p3.tongs"], col=2)
  legend("topleft", c("5' subset", "3' subset"), fill=seq(1,2))
}
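
## Example (sketch, not run): computing and inspecting the tongs for one chip.
## 'abatch' is a hypothetical AffyBatch, e.g. obtained via affy::ReadAffy().
##   tongs <- GetTongs(abatch, chip.idx = 1)
##   hook.params <- EstimateHookParams(tongs)
##   PlotTongs(tongs)
##   hook.params$hook.max.sigma  # sigma at which the hook (3' minus 5' tongs) is maximal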
## Computes average intensity per interval or probe index
getGeneralDegradationMeans <- function(x, intensity, min.x = 1, max.x = 11, breaks = 11) {
  subset.x <- x >= min.x & x <= max.x
  x <- x[subset.x]
  intensity <- intensity[subset.x]

  if(length(table(x)) <= breaks) {  # small number of positions -> index correction -> simple average
    intensity <- log10(intensity)
    mean.intensity <- tapply(intensity, x, mean)
    return(data.frame(x = as.numeric(names(mean.intensity)), i = 10^(mean.intensity)))
  } else {
    whist <- hist(x, breaks = breaks, plot=F)
    intervals <- length(whist$counts)

    absoluteMeans <- data.frame(x = rep(0, intervals), i = rep(0, intervals))
    for(i in 1:intervals){
      ## if (whist$counts[i] > 1) {
      absoluteMeans[i,"x"] <- mean(c(whist$breaks[i], whist$breaks[i+1]))
      absoluteMeans[i,"i"] <- 10^mean(log10(intensity[x >= whist$breaks[i] & x < whist$breaks[i+1]]),
                                      na.rm = T)
    }
    return(absoluteMeans)
  }
}

# Function to plot the decay function d(x), with x given as probe index (k) or probe distance (L)
PlotDx <- function(means, min.x = 1, max.x = 11, scale.x = 1, ...) {
  d.x <- means$i / means$i[1]
  theoretic.d <- TryToFitDecayFunction((means$x - min.x)/scale.x, d.x,
                                       d.x[length(d.x)], 0.6)  # - min.x to scale to 0

  plot(0, xlim=c(0, max.x), ylim=c(0.1, 1.2), pch=".", col="grey20",
       xlab="x", ylab="D(x)", panel.first = grid(), ...)
  points(means$x, d.x, pch=17)

  xx <- seq(min(0, min.x), max.x, length.out = 100)
  lines(xx, DecayFunction((xx - min.x) / scale.x,
                          theoretic.d$d1, theoretic.d$d2, theoretic.d$shiftY),
        lty = 2, col = "grey60", lwd = 2)
}

ProbesetLogAverage <- function(x) {
  mean(log10(x))
}

ProbesetAverage <- function(x) {
  mean(x)
}

## Settings for index/absolute correction
kCorrectionTypeSettings <- list(
  index    = list(type.x = "index",    scale.x = 1,  min.x = 1, max.x = 11,  n.breaks = 11),
  absolute = list(type.x = "distance", scale.x = 60, min.x = 0, max.x = 600, n.breaks = 12))
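
## Example (sketch, not run): per-position means from synthetic probe data in the
## index-based setting. The intensities below are assumptions for illustration only
## (log-intensity decaying with probe index plus noise).
##   x.demo <- rep(1:11, times = 100)                                # probe index within probeset
##   i.demo <- 10^(3 - 0.02 * x.demo + rnorm(length(x.demo), sd = 0.1))
##   means.demo <- getGeneralDegradationMeans(x.demo, i.demo, min.x = 1, max.x = 11)
##   PlotDx(means.demo, min.x = 1, max.x = 11, scale.x = 1)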
DegradationAnalysis <- function(affyData, location.type = "index",
                                location.file.dir = NULL, plot.images = FALSE) {
  ## Get probe info
  probe.info <- getProbeInfo(affyData, location.type, location.file.dir)

  if (location.type == "index") {
    scale.x  <- 1
    min.x    <- 1
    max.x    <- 11
    n.breaks <- 11
    positions <- min.x:n.breaks
  } else {
    scale.x  <- 60  # enables using the same fitting defaults for absolute and index
    min.x    <- 0
    max.x    <- 600
    n.breaks <- 12
    positions <- seq((max.x-min.x)/n.breaks/2, by = (max.x-min.x)/n.breaks,
                     length.out = n.breaks)

    ## Print some statistics about absolute probe locations
    message(round(sum(!is.na(probe.info$x)) / nrow(probe.info), 2)*100,
            "% of all probes can be aligned to their target mRNA",
            " for the current chip type (", cdfName(affyData), ").")
  }

  ## Get basic probe and probeset information
  x <- probe.info$x
  probeset.id <- probe.info$probeset.id
  probeset.size <- ave(probe.info$probeset.id, list(probe.info$probeset.id), FUN=length)

  ## Set up statistics table
  stats.all <- data.frame(d.inf = rep(0, length(affyData)), lambda.inv = 0, shift.y = 0,
                          tongs.max.raw = 0, tongs.max.fit = 0, ns.threshold = 0,
                          hook.max.sigma = 0, placeholder = 0, decay = 0,
                          row.names = sampleNames(affyData))
  means.pm.all <- matrix(0, n.breaks, length(affyData),
                         dimnames = list(x.center = positions, sample = sampleNames(affyData)))
  means.mm.all <- matrix(0, n.breaks, length(affyData),
                         dimnames = list(x.center = positions, sample = sampleNames(affyData)))

  ## Iterate over chips
  for(chip.idx in 1:length(affyData)) {
    currentSample = sampleNames(affyData)[chip.idx]
    message("Computing degradation for chip ", chip.idx, ": ", currentSample)

    ## Create main table of chip probes
    intensity <- as.vector(pm(affyData[,chip.idx]))
    sigma <- ave(intensity, list(probe.info$probeset.id), FUN=ProbesetLogAverage)

    ## Create tongs and estimate parameters
    tongs <- ComputeTongs(x, intensity, sigma, probeset.id, min.x, max.x)
    hook.params <- EstimateHookParams(tongs)
    if (plot.images) PlotTongs(tongs)

    ## Compute d(x) once (means and fit)
    h.max <- hook.params$hook.max.sigma
    subset.probes <- (probeset.size == 11) & (h.max - 0.4 < sigma) & (sigma < h.max + 0.2)
    means <- getGeneralDegradationMeans(x[subset.probes], intensity[subset.probes], min.x, max.x)

    d.x <- means$i / means$i[1]
    theoretic.d.s <- TryToFitDecayFunction((means$x - min.x)/scale.x, d.x, d.x[length(d.x)], 0.6)
    decay = (d.x[length(d.x)] + d.x[length(d.x)-1]) / 2

    ## Compute scaling function for chip
    x.limited = pmin(x, max.x)  # probes located farther away than max.x are corrected using max.x
    theoretic.d.s.all <- with(theoretic.d.s, DecayFunction((x.limited - min.x)/scale.x, d1, d2, shiftY))

    tongs.loess <- loess(hook ~ sigma, tongs, span=2/3)

    ## Define the scaling function such that the average scaling of the region that has been used to
    ## estimate D(s) is 1.
    scale.f.y <- pmax(0, tongs.loess$fit)  # scaling must never be smaller than 0
    average.scaling.in.specific <- mean(scale.f.y[h.max - 0.4 < tongs$sigma & tongs$sigma < h.max + 0.2])
    scale.f.y <- scale.f.y / average.scaling.in.specific

    ## Separately correct pm and mm
    for(probe.type in c("mm","pm")) {  # mm before pm, to avoid using previously corrected pm values

      ## Assign PM or MM accessor functions, for convenience
      if (probe.type == "pm") {
        pmOrMm <- pm
        `pmOrMm<-` <- `pm<-`
      } else {
        pmOrMm <- mm
        `pmOrMm<-` <- `mm<-`
      }

      intensity <- as.vector(pmOrMm(affyData[,chip.idx]))

      ## Use separate sigma for PM and MM
      sigma <- ave(intensity, list(probeset.id), FUN=ProbesetLogAverage)

      means <- getGeneralDegradationMeans(x[subset.probes], intensity[subset.probes], min.x, max.x)
      # print(means)
      if (plot.images) PlotDx(means, min.x, max.x, scale.x)

      ## Save PM/MM mean statistics
      if (probe.type == "pm") {
        means.pm.all[,chip.idx] <- means$i
      } else {
        means.mm.all[,chip.idx] <- means$i
      }

      ## Apply scaling function to sigma and x values
      if (plot.images) plot(tongs$sigma, scale.f.y, main=paste("Scaling function, ", probe.type))
      scaling.x <- approx(tongs$sigma, scale.f.y, sigma, rule=2)$y
      probe.scale <- 1 - scaling.x*(1 - theoretic.d.s.all)  # equals scaling.x*theoretic.d.s.all + (1-scaling.x)

      ## It is mandatory to limit the scaling factor - scalings < 0 confound correction (like hook)
      probe.scale <- pmax(0.1, probe.scale)
      probe.scale <- pmin(1.0, probe.scale)

      ## Update pm and mm data
      pmOrMm(affyData)[,chip.idx] <- pmOrMm(affyData)[,chip.idx] / probe.scale

      if (plot.images) {
        means <- getGeneralDegradationMeans(x[subset.probes],
                                            as.vector(pmOrMm(affyData[,chip.idx]))[subset.probes],
                                            min.x, max.x)
        PlotDx(means, min.x, max.x, scale.x)
        tongs.cor <- ComputeTongs(x, as.vector(pmOrMm(affyData[,chip.idx])), sigma,
                                  probeset.id, min.x, max.x)
        PlotTongs(tongs.cor)
      }
    }

    stats.all[chip.idx, ] <- c(unlist(c(theoretic.d.s, hook.params)), decay)
  }

  return(list(location.type = location.type,
              afbatch = affyData,
              stats = stats.all,
              means.pm = means.pm.all,
              means.mm = means.mm.all))
}
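
## Example (sketch, not run): running the index-based degradation analysis on an
## AffyBatch. 'cel.dir' and 'abatch' are hypothetical; CEL files are read with affy::ReadAffy().
##   library(affy)
##   abatch <- ReadAffy(celfile.path = cel.dir)
##   deg <- DegradationAnalysis(abatch, location.type = "index")
##   deg$stats    # per-chip d.inf, lambda.inv, shift.y, tongs/hook parameters, decay
##   deg$afbatch  # AffyBatch with degradation-corrected PM and MM intensities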
plotDegradation <- function(object) {
  means = object$means.pm
  x = as.numeric(rownames(means))

  with(kCorrectionTypeSettings[[ object$location.type ]], {
    ## Use index/absolute specific min.x, max.x, scale.x
    plot(0, xlim=c(min.x, max.x), ylim=c(0.1, 1.2), pch=".", col="grey20",
         xlab="x", ylab="D(x)", panel.first = grid())

    cols <- rainbow(ncol(means))
    for (i in seq(1, ncol(means))) {
      d.x <- means[, i] / means[1, i]
      theoretic.d <- TryToFitDecayFunction((x - min.x)/scale.x, d.x,
                                           d.x[length(d.x)], 0.6)  # - min.x to scale to 0

      points(x, d.x, pch=17, col=cols[i])
      xx <- seq(min(0, min.x), max.x, length.out = 100)
      lines(xx, DecayFunction((xx - min.x) / scale.x,
                              theoretic.d$d1, theoretic.d$d2, theoretic.d$shiftY),
            lty = 2, lwd = 2, col = cols[i])
    }
    legend("topright", colnames(means), fill=cols)
  })
}
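
## Example (sketch, not run), continuing the hypothetical 'deg' object from above:
##   plotDegradation(deg)  # one curve per chip: D(x) means (triangles) and fitted decay model
##   deg$means.pm          # raw per-position PM means underlying the plot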