% SUPPLEMENTARY MATERIAL
% !Rnw weave = knitr
% -*-coding:latin-1-*-
\RequirePackage{fix-cm}
\documentclass[paper=a4,notitlepage,DIV=12]{scrartcl}
\pdfminorversion=4 % needed to ensure PDF version 1.4 for Journal tools
\usepackage{siunitx}
\usepackage{fixltx2e}
\usepackage{setspace}
\usepackage{array}
\newcolumntype{L}[1]{>{\raggedright\let\newline\\\arraybackslash\hspace{0pt}}m{#1}}
\usepackage{lineno}
\usepackage{amsmath}
\usepackage{amsfonts}
%\usepackage{amsthm}
\usepackage{graphicx}
\usepackage{latexsym}
\usepackage[numbers,sort&compress]{natbib}
\citestyle{nature}
\usepackage{nomencl}
%\usepackage{svninfo}
\usepackage[latin1]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{lmodern}
\usepackage[activate={true,nocompatibility},final,tracking=true,kerning=true,spacing=true,factor=1100,stretch=10,shrink=10]{microtype}
\usepackage{textcomp}
\usepackage{eurosym}
\usepackage{upgreek}
\usepackage{setspace}
\usepackage{gensymb}
\usepackage[colorinlistoftodos,disable]{todonotes} % disable
\usepackage{hyperref}
\newcommand{\g}[1]{\emph{#1}}
\makenomenclature
\let\nc\nomenclature
\newcommand{\piqe}{\left(\pi_0 \ Q \ e^{tQ}\right)_i}
\newcommand{\piqqe}{\left(\pi_0 \ Q^2 \ e^{tQ}\right)_i}
\newcommand{\pie}{\left(\pi_0 \ e^{tQ}\right)_i}
\newcommand{\field}[1]{\mathbb{#1}}
\newcommand{\C}{\field{C}}
\newcommand{\R}{\field{R}}
\newcommand{\N}{\field{N}}
\newcommand{\Q}{\field{Q}}
\newcommand{\A}{\ifmmode{\field{A}} \else{$\field{A}$}\fi}
\renewcommand{\P}{\field{P}}
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\Var}{Var}
\DeclareMathOperator*{\modell}{Modell}
\DeclareMathOperator*{\rang}{rang}
\DeclareMathOperator*{\Mn}{\ifmmode{{\bf M(n;\R) }}\else{$\bf M(n;\R) $}\fi}
\DeclareMathOperator*{\cof}{cof}
\DeclareMathOperator*{\adj}{Adj}
\DeclareMathOperator*{\tr}{tr}
\DeclareMathOperator*{\EMP}{{\boldmath EMP}}
\graphicspath{{../figures/}}
%% florian added:
%\usepackage[usenames,dvipsnames]{xcolor}
\usepackage{color}
%\newcommand{\todo}[1]{\textbf{\color{red} #1}}
\newcommand{\fm}[1]{{\color{BlueGreen} #1}}
\newcommand{\revised}[1]{{\color{blue} #1}}
\definecolor{gray}{rgb}{0.6,0.6,0.6}
\newcommand{\texttext}{\emph{\color{gray}`Twas brillig, and the slithy
toves/ Did gyre and gimble in the wabe:/ All mimsy were the
borogoves,/ And the mome raths outgrabe./ "Beware the Jabberwock, my
son!/ The jaws that bite, the claws that catch!/ Beware the Jubjub
bird, and shun/ The frumious Bandersnatch!"}}
%% roland added:
\usepackage{rotating}
\usepackage{multirow}
\newcommand{\InsertLandscapeTable}[4]{
\begin{sidewaystable}[!htbp]
\begin{center}
\leavevmode
#4
\caption[#1]{#1 #2}
\label{#3}
\end{center}
\end{sidewaystable}}
\newcommand{\nomargin}{\vspace{-2.7cm}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{Supporting Information for Schwarz~et~al,~PLoS~Medicine~2015\\ --Heterogeneity measures and survival analysis--}
\date{\today}
\begin{document}
<>=
opts_chunk$set(concordance=TRUE)
@
\maketitle
%\noindent$^1$ Cancer Research UK Cambridge Institute, Li Ka Shing Centre, Cambridge, CB2 0RE, UK\\
%$^2$ Department of Oncology, University of Cambridge, Hutchison/MRC Research Centre, Hills Road, Cambridge, CB2 0XZ, UK\\
%$^3$ Cambridge University Hospitals NHS Foundation Trust, Cambridge Biomedical Campus, Hills Road, Cambridge, CB2 0QQ, UK\\
%$^4$ University Department of Radiology, Box 218, Level 5, Addenbrooke's Hospital, Hills Road, Cambridge, CB2 2QQ, UK\\
%$^\dagger$ Current affiliations: RFS, European Bioinformatics Institute, Hinxton; CKYN, The Institute for Cancer Research, London; SLC, The Sanger Institute, Hinxton; SN, Department of Human Genetics, Emory University School of Medicine.\\
%\medskip
%\noindent$^*$ To whom correspondence should be addressed: florian.markowetz@cruk.cam.ac.uk,\\ james.brenton@cruk.cam.ac.uk\\
%\noindent Keywords: Intratumoral genetic heterogeneity, high-grade serous ovarian carcinoma, neoadjuvant chemotherapy, biomarker, array comparative genomic hybridization, next-generation sequencing
\thispagestyle{empty}
\tableofcontents
\listoftodos
\clearpage
%%
%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Clinical data}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Data overview}
<>=
lower.triangle=function(M, include.diagonal=FALSE) {
## extracts the lower triangular matrix of a matrix M as a vector (column-wise)
if (!is.matrix(M)) return(NA)
p=c()
if (!include.diagonal) {
for (i in 1:(ncol(M)-1)) {
p=append(p, as.numeric(M[(i+1):nrow(M),i]))
}
} else {
for (i in 1:ncol(M)) {
p=append(p, as.numeric(M[i:nrow(M),i]))
}
}
return(p)
}
@
First we load the data and R packages. The data file is part of the paper supplement, and we have also made a copy available online.
<>=
library(survival)
library(kernlab)
library(rms)
library(spatstat)
library(RColorBrewer)
library(gplots)
## load data file from local copy or from URL
if (file.exists("Schwarz2015-supplement.Rdata")){
load("Schwarz2015-supplement.Rdata")
cat("Data loaded from local copy")
} else {
load(url("http://www.markowetzlab.org/supplements/Schwarz2015-supplement.Rdata"))
cat("Data loaded from URL") }
@
The first object in the .Rdata file is a table \texttt{D} with patient information:
<>=
print(D,digits=3)
attach(D)
@
<>=
## Print the data table in LaTeX format for inclusion into main mansucript
library(xtable)
print(xtable(D),file="TableOverview.tex")
@
Rownames correspond to sample identifiers. Columns indicate the patient numbers used in the manuscript (\texttt{Nr}), as well as values for temporal heterogeneity (\texttt{TH}), clonal heterogeneity index (\texttt{CE}), overall survival in days (\texttt{OS}), progression-free survival in days (\texttt{PFS}) and indicators for survival (\texttt{dead}) and progression (\texttt{prog}) as well as covariates for age, stage (ordered factor), residual disease after debulking surgery (\texttt{residual}, ordered factor) and the number of samples per patient (\texttt{N}).
Case OV03-10 did not have surgical debulking attempted so is NA for residual disease. OV03-24 is shown as NA for patient number because 0/6 samples were left after quality control and were not used in the analysis. Patients with less than 3 samples were not suitable for phylogenetic reconstruction and are removed from later analyses (see below).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Progression free survival and overall survival}
We have two measures of survival: progression free survival (PFS) and overall survival (OS), which are correlated. Summary statistics for PFS and OS in days are:
<>=
summary(PFS)
summary(OS)
@
We plot PFS and OS for all cases.
<>=
all.pfs.fit<-survfit(Surv(PFS, prog) ~ 1)
all.pfs.fit
all.os.fit<-survfit(Surv(OS, dead) ~ 1)
all.os.fit
par(mfrow=c(1,2))
plot(all.pfs.fit, main="Kaplan-Meier estimate for PFS\nwith 95% confidence bounds",
xlab="time", ylab="survival")
plot(all.os.fit, main="Kaplan-Meier estimate for OS\nwith 95% confidence bounds",
xlab="time", ylab="survival")
@
The estimated median follow-up time can be calculated by the reverse Kaplan-Meier method. We invert the censoring index for death to estimate time to loss of follow up.
<>=
survfit(Surv(OS, !dead) ~ 1)
@
There are very few censoring events for OS so the estimate is NA, but this also means that median OS closely approximates median follow up.
We assume 30 days in a nominal month.
<<>>=
summary(OS)/30
@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\clearpage
\subsection{Correlation between PFS and OS}
<>=
plot(PFS,OS, main="Correlation between survival data")
cor.test(PFS,OS)
@
Therefore, PFS and OS are only weakly correlated.
After this analysis of the complete survival data, we remove the cases with insufficient samples for phylogenetic reconstruction ($ N < 3 $) after quality control (cases OV03-07, OV03-23, OV03-24, OV04-27):
<>=
## Remove samples
D <- D[N>=3,]
detach(D);attach(D)
sum(N)
@
%%
%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\clearpage
\section{Heterogeneity measures}
MEDICC was used to reconstruct evolutionary histories in two separate analyses;
\begin{enumerate}
\item A joint inference, across all samples and all patients, resulting in a single distance matrix \texttt{M}. This has equal numbers of rows and columns to the total number of samples in the study (= \Sexpr{nrow(M)}). Major and minor allele CN were summed to a total CN profile to ensure that the analysis was computationally tractable for inferring evolutionary distances, This distance matrix was used to build the large overview tree in the paper (Fig.~1) and to compute the temporal heterogeneity (TH) index. Maintaining a joint reference frame is necessary to make the TH index comparable between patients.
\item A single inference per patient, using all samples for that particular individual. This results in $17$ individual distance matrices stored in the list object \texttt{Ms}. The length of each index is the number of samples available for each patient. These smaller inferences are used to maintain the separate alleles and to perform allele specific phasing and inference. These distance matrices were used to estimate the clonal expansion (CE) index which is a normalised measure that remains comparable between patients.
\end{enumerate}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Exploratory data analysis}
First we present analysis~1 using the distance matrix \texttt{M} across all samples of all patients.
We reduce \texttt{M} to the \Sexpr{nrow(D)} patients with \texttt{N>3} listed in \texttt{D} \ldots
<<>>=
select <- sapply(strsplit(rownames(M),"_"),function(x)x[1]) %in% rownames(D)
M <- M[select,select]
@
\ldots and remove diploids \ldots
<<>>=
diploid.idx <- grep("diploid",rownames(M))
M <- M[-diploid.idx, -diploid.idx]
@
\ldots which results in data for the \Sexpr{nrow(M)} samples listed in \texttt{D}:
<<>>=
pids <- sapply(strsplit(rownames(M),"_"),function(x)x[1])
patients <- unique(pids)
table(pids)
@
Finally we define colors for each patient and for resistance/sensitivity:
<<>>=
patient.colors <- c(brewer.pal(12,"Paired"), "black", "cyan", "yellow", "green", "magenta")
names(patient.colors) <- patients
groups=factor(ifelse(PFS<=365,"resistant","sensitive"))
names(groups) <- patients
groups.colors <- ifelse(groups=="resistant","red","yellow")
names(groups.colors) <- patients
table(groups)
@
On the following pages we provide three overview plots for exploratory analysis.
\begin{enumerate}
\item matrix \texttt{M} in a heatmap;
\item projection of all samples into 2 dimensions;
\item samples per patient projected in 2D.
\end{enumerate}
\newpage
\subsubsection*{Heatmap of distance matrix}
For a global overview we plot matrix \texttt{M} in a heatmap. The color gradient ranges from white (no differences) to dark red (many differing events).
<>=
heatmap.2(M,
symm = TRUE, Colv=TRUE, scale="none", trace="none",
col=c("white",brewer.pal(9,"Reds")), labCol="",labRow="",
RowSideColors=patient.colors[pids],ColSideColors=groups.colors[pids])
@
The light boxes along the diagonal are the samples belonging to the same patient (also shown as color blocks at the rows of the matrix). The colors above the columns of the matrix indicate whether the sample comes from a patient with resistant or sensitive disease. No obvious clustering is visible.
\newpage
\subsubsection*{Projection of all samples into 2D}
We next plot the projection of all samples into 2 dimensions, using a kernel matrix \texttt{K} computed from \texttt{M}. Again no clustering into resistant or sensitive subgroups is immediately visible.
<>=
pchs <- c(21,25)
names(pchs) <- levels(groups)
K <- exp(-M/300) ## Kernel matrix; rescaling for numerical stability
KPCA <- kpca(as.kernelMatrix(K)) ## Kernel Princ Comp Ana
plot(rotated(KPCA)[,c(1,2)],
bg=patient.colors[pids], col="grey", pch=pchs[groups[pids]],
xlab = "PC 1", ylab = "PC 2", xlim= c(-8,4))
pidcount <- table(pids)
legend("topleft", fill=patient.colors[names(pidcount)],
legend=paste(names(pidcount), " (", pidcount, ")", sep=""))
legend("bottomleft", pch=pchs, legend=c("resistant", "sensitive"))
@
\newpage
\subsubsection*{Samples per patient in 2D}
We also plot the samples per patient projected in 2D. These are the patterns the TH index is computed from in the next section.
<>=
par(mfrow=c(3,5),mar=c(0,0,1,0)+.2)
X <- rotated(KPCA)
for (p in patients){
Xp <- X[which(pids %in% p),,drop=F]
time <- substr(sapply(strsplit(rownames(Xp),"_"), function(c) c[3]),1,1)
plot(Xp, pch=16, col=ifelse(time=="S","orange","lightblue"),
main=p,xaxt="n",yaxt="n",xlab="",ylab="")
}
plot.new()
legend("topleft",pch=19,
legend=c("Surgery","Biopsy"),col=c("orange","lightblue"))
@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage
\subsection{Summary statistics of overall divergence}
We were mainly interested in two features, (i) the median number of events within all cancer samples of a patient and (ii) the median divergence from the diploid normal per patient:
<>=
div <- sapply(patients, function(p) {
dat <- Ms[[p]]
idx <- grep("diploid", rownames(dat))
nondip <- dat[-idx, -idx]
dip <- dat[ idx, -idx]
return(c(med.div=median(lower.triangle(nondip)),
med.dist.to.dip=median(dip))) })
div
@
Over all patients these values can be summarized:
<<>>=
## Median number of events summarized over all patients
summary(div[1,])
## Median distance to diploid summarized over all patients
summary(div[2,])
@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Computing temporal heterogeneity (TH)}
To compute temporal heterogeneity from samples we need two functions. The first one, \texttt{robust.com()}, computes the \emph{robust centre of mass}, i.e. the mean after excluding the most distant point:
<<>>=
robust.com = function(X) { ## robust center of mass
if (nrow(X)<3) return(apply(X,2,mean))
com <- apply(X,2,mean)
leave.out <- which.max(as.matrix(dist(rbind(com,X)))[1,])-1
return(apply(X[-leave.out,,drop=F],2,mean))
}
@
Next, in function \texttt{computeTH()} we apply \texttt{robust.com()} to the surgery and biopsy samples of each patient. Temporal heterogeneity is then defined as the distance between the two robust centres of mass.
<>=
## the function picks up X, pids, etc from the mother environment.
computeTH <- function(p){
## select patient samples
Xp <- X[which(pids %in% p),,drop=F]
## split into surgery and biopsy samples
time <- substr(sapply(strsplit(rownames(Xp),"_"), function(c) c[3]),1,1)
Xsurgery <- Xp[time=="S",,drop=F]
Xbiopsy <- Xp[time=="B",,drop=F]
## compute distance between robust centres of mass
if (nrow(Xsurgery) * nrow(Xbiopsy) !=0) {
cen.surg <- robust.com(Xsurgery)
cen.biop <- robust.com(Xbiopsy)
a <- cen.surg-cen.biop
return(sqrt(t(a) %*% a))
## if no samples exist in one group, return NA
} else { return(NA) }}
@
Applying function \texttt{computeTH} to all patients we get
<>=
TH.repro <- sapply(patients,computeTH)
TH.repro
## This reproduces the numbers in the paper:
round(TH.repro - D$TH,2)
@
Since the computation of TH is based on a global PCA of \emph{all} patients, the values for each patient can vary slightly if the data for other patients change.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Computing the clonal expansion index (CE)}
To compute the clonal expansion index (CE) we use the individual distance matrices \texttt{Ms} which were based on the more accurate allele-specific distance estimates.
As the CE index is based on a permutation test on the spatial distribution of points (Ripley's K), we repeat the analysis \texttt{n} times and average
over the resulting value to get a more robust estimate.
<>=
computeCE <- function(p, n=30){
## format distance matrix
mydist = Ms[[p]]
if (any(is.na(mydist))) { return(NA) }
dip.idx = grep("diploid",rownames(mydist))
mydist = mydist[-dip.idx,-dip.idx]
## Kernel PCA
Kdist = as.kernelMatrix(exp(-mydist/max(mydist)))
KPCA=rotated(kpca(Kdist))
## compute envelope, nsim=19 is 5% alpha level according to
## http://www.csiro.au/files/files/p10ib.pdf
f <- ifelse(nrow(Kdist)<4,2,1.3/sqrt(1-4/nrow(Kdist)))
x <- KPCA[,1]
y <- KPCA[,2]
PPP=suppressWarnings(ppp(x,y,window=ripras(x,y,shape="rectangle",f=f)))
if (n>=1) {
lstat.replic=sapply(1:n,function(x) {
ENV=suppressWarnings(spatstat::envelope(PPP, Lest, nsim=19, nrank=1,
correction="best", global=T, verbose=F))
obstheo= abs(ENV$obs-ENV$theo) ## distance observed <-> theoretical
hitheo = abs(ENV$hi-ENV$theo) ## width of confidence bound around theoretical
maxidx = which.max(obstheo)
lstat = obstheo[maxidx] / hitheo[maxidx]
return(lstat)
})
CE = mean(lstat.replic)
}
return(CE)
}
@
Applying this function to all patients we get:
<>=
set.seed(10000) ## seed set to ensure reproducibility
CE.repro <- sapply(patients,computeCE)
CE.repro
## This reproduces the numbers in the paper (up to resampling)
round(CE.repro - D$CE,2)
@
The computation of CE is an average over \texttt{n=30} runs and the differences we observe are due to the variability in resampling.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Correlation between heterogeneity measures}
We now have two measures of intratumoural heterogeneity: clonal expansion (CE) and temporal heterogeneity (TH) which we test for correlation:
<>=
summary(TH)
summary(CE)
plot(TH,CE, main="Correlation between CE and TH")
cor.test(TH,CE)
@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Heterogeneity and number of samples}
We next explored how the heterogeneity indices are correlated with the number of samples per patient.
<>=
par(mfrow=c(1,2))
plot(N,TH,main="TH versus sample size")
plot(N,CE,main="CE versus sample size")
cor.test(N,TH)
cor.test(N,CE)
@
TH and N do not show a significant correlation. The moderate positive correlation between N and CE is not unexpected, because larger trees can more easily show complex substructures.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Heterogeneity and clinical variables}
<>=
par(mfrow=c(2,2))
boxplot(CE ~ Stage,main="CE vs Stage",xlab="Stage",ylab="CE")
boxplot(TH ~ Stage,main="TH vs Stage",xlab="Stage",ylab="TH")
boxplot(CE ~ residual,main="CE vs Residual",xlab="Residual disease",ylab="CE")
boxplot(TH ~ residual,main="TH vs Residual",xlab="Residual disease",ylab="TH")
@
\noindent
The plots show some trends but the boxplots largely overlap and the sample size does not allow strong conclusions.
%%
%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage
\section{Exploratory survival analysis}
It has been proposed that for a tumour to overcome selection pressure by chemotherapy it needs to be able to efficiently explore the mutational landscape \citep{Greaves2012}.
Therefore, we hypothesized that polygenomic tumours with a high mutation rate that have already undergone clonal expansions are likely to be at an advantage.
We investigated if our measures of heterogeneity, the TH and CE indices, could predict progression-free survival (PFS) and overall survival (OS). Here, we summarize our main results before reproducing them on the data in the next sections. Because of the small sample size these are only exploratory analyses.
\paragraph{Survival differences between low-CE and high-CE}
We used the log-rank test for survival \citep{Harrington1982} to investigate differences between the CE-low and CE-high subgroups for both PFS (section~\ref{medianCE-PFS}) and OS (section~\ref{medianCE-OS}).
Survival was shorter in patients in the CE-high category (PFS 12.7 vs 10.1 months; $P = 0.009$; OS 42.6 vs 23.5 months; $P = 0.003$).
CE-high was an independent predictor of survival in a multivariate Cox-hazard regression analysis which included age, stage and residual disease after debulking surgery (PFS, $P=0.001$; OS, $P=0.004$; section~\ref{multCox}).
Investigating survival differences between patients with low or high temporal heterogeneity (TH index) did not show significant differences (section~\ref{medianTH}).
\paragraph{Continuous CE and survival}
These analyses used an unbiased cutpoint but in general the use of a categorical variable has reduced statistical power and may not be the most clinically relevant method of dividing patients.
We therefore tested CE as a continuous variable in a Cox proportional hazard model which assumes a linear relationship between CE and survival (section~\ref{contCE}).
In univariate analysis, the quantitative CE-index had a borderline significant association with overall survival (OS) with $\mathrm{HR} = 2.7$ (95\% CI 0.96, 7.8; $ P = 0.06 $) but no significant association with progression free survival (PFS).
In multivariate models that considered CE, age, stage, and residual disease, CE as a quantitative variable was not significantly associated with OS or PFS (coefficient $ P = 0.64 $ and $ P = 0.76 $ respectively).
We also considered that the relationship between CE and survival might not be linear and the relative hazard might change drastically for some values of CE while staying stable for others.
We therefore examined potential non-linear effects CE on survival using spline smoothing, a standard technique in survival analysis.
For PFS and OS, the relationship between CE and relative hazard was non-linear showing a step function effect with marked increase in hazard seen at CE values greater than \numrange{0.7}{0.8} supporting the relevance of using the median as a cut point.
To further determine if the hazard ratio for CE-high was likely to be $>1$ (deleterious for outcome), we performed tests of robustness using bootstrap analysis with 10,000-fold resampling.
The differences in PFS and OS remained significant ($ P < 0.05 $) in $82$\% and $92$\% respectively of the perturbed datasets.
The bootstrapped derived median values hazard ratios for PFS and OS were $ \mathrm{HR} = 7.1 $ (95\% CI > 1) and $ \mathrm{HR} = 11.4 $ (95\% CI > 1) respectively.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Clonal expansion index and survival}
\label{medianCE}
We use the median clonal expansion value to split patients into low-CE (factor \texttt{CEhigh=0}) and high-CE (factor \texttt{CEhigh=1}) subgroups:
<>=
cutpointCE <- quantile(CE, 0.5, na.rm=TRUE)
D$CEhigh <- 1 * (CE > cutpointCE)
detach(D); attach(D)
@
\noindent
We plot a histogram of clonal expansion values with median indicated as red line.
<>=
hist(CE,14,col="grey",main="Clonal expansion index")
abline(v=cutpointCE,col="red",lwd=2)
@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{CE stratification and PFS}
\label{medianCE-PFS}
The median \textbf{progression free survival} (PFS) differed between the low-CE and high-CE subgroups.
<<>>=
fit <- npsurv(Surv(PFS,prog) ~ CEhigh)
@
We test the significance of this survival difference using the log-rank test.
<>=
res1 <- survdiff(Surv(PFS,prog) ~ CEhigh)
p1 <- round(1-pchisq(res1$chisq,1),3)
res1
ntab = table(D$CEhigh)
txtlow = sprintf("CE low (n=%d)", ntab[1])
txthigh = sprintf("CE high (n=%d)", ntab[2])
survplot(fit, conf="none", n.risk=TRUE, col=c("black", "red"), xlab="Progression free survival [days]", lwd=3,las=1, lty=1, cex=0.8)
@
\paragraph{Bootstrap analysis}
To check the robustness of our results, we used bootstrap analysis. We sampled the data with replacement 10000 times and redid the survival analysis as above:
<>=
set.seed(43) ## seed set to make the result reproducible
Nboot <- 10000
resB1a <- replicate(Nboot,
survdiff(Surv(PFS,prog) ~ CEhigh,
subset=sample(1:nrow(D),replace=TRUE))$chisq)
pB1a <- 1-pchisq(resB1a,1)
mean( pB1a < 0.05 )
@
Therefore, using a log-rank test the result stays significant in \Sexpr{round(mean(pB1a < 0.05),2)*100}\% of perturbed data sets.
<>=
resB1b <- replicate(Nboot,
coxph(Surv(PFS,prog) ~ CEhigh,
subset=sample(1:nrow(D),replace=TRUE))$coef)
round(quantile( resB1b, c(.025,.5,.975)),2)
exp(quantile( resB1b, c(.025,.5,.975)))
@
This means: Using a Cox proportional hazard model we found a bootstrap median coefficient of \Sexpr{round(quantile(resB1b,.5),2)} and a 95\% bootstrap confidence interval of (\Sexpr{round(quantile(resB1b,.025),2)}, \Sexpr{round(quantile(resB1b,.975),2)}), which does not overlap with $0$.
The hazard ratio has a bootstrap median of \Sexpr{round(exp(quantile(resB1b,.5)),2)} and a 95\% bootstrap confidence interval that does not overlap with $1$.
The quantile method we used here is conservative.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{CE stratification and OS}
\label{medianCE-OS}
The median \textbf{overall survival} (OS) differed between the low-CE and high-CE subgroups.
<<>>=
fit <- npsurv(Surv(OS,dead) ~ CEhigh)
@
We test the significance of this survival difference using the log-rank test.
<>=
res2 <- survdiff(Surv(OS,dead) ~ CEhigh)
p <- round(1-pchisq(res2$chisq,1),3)
res2
survplot(fit, conf="none", n.risk=TRUE, col=c("black", "red"), xlab="Overall survival [days]", lwd=3,las=1, lty=1, cex=0.8)
@
We estimate the univariate hazard ratio for CEhigh on OS using a Cox proportional hazards model
<>=
summary(coxph(Surv(OS, dead) ~ CEhigh))
@
\paragraph{Bootstrap analysis}
We checked the robustness of our results using bootstrap analysis with $\Sexpr{Nboot}$ replicates:
<>=
set.seed(43) ## seed set to make the result reproducible
resB2a <- replicate(Nboot,
survdiff(Surv(OS,dead) ~ CEhigh,
subset=sample(1:nrow(D),replace=TRUE))$chisq)
pB2a <- 1-pchisq(resB2a,1)
mean( pB2a < 0.05 )
@
Therefore, using a log-rank test the result stays significant in \Sexpr{round(mean(pB2a < 0.05),2)*100}\% of perturbed data sets.
<>=
resB2b <- replicate(Nboot,
coxph(Surv(OS,dead) ~ CEhigh,
subset=sample(1:nrow(D),replace=TRUE))$coef)
round(quantile( resB2b, c(.025,.5,.975)),2)
exp(quantile( resB2b, c(.025,.5,.975)))
@
This means: Using a Cox proportional hazard model we found a bootstrap median coefficient of \Sexpr{round(quantile(resB2b,.5),2)} and a 95\% bootstrap confidence interval of (\Sexpr{round(quantile(resB2b,.025),2)}, \Sexpr{round(quantile(resB2b,.975),2)}), which does not overlap with $0$.
The hazard ratio has a bootstrap median of \Sexpr{round(exp(quantile(resB2b,.5)),2)} and a 95\% bootstrap confidence interval that does not overlap with $1$.
The quantile method we used here is conservative.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Multivariate Cox-hazard model}
\label{multCox}
The univariate analysis above showed significant differences in survival curves for patients stratified by the clonal expansion index.
Here we use multivariate Cox-hazard regression to assess if this result holds up when corrected for the number of samples per patient and classical prognostic factors: age, stage and residual disease after debulking surgery.
In the following analysis we use the \texttt{anova()} function to produce a sequential analysis of deviance table for the fitted model. Variables are added sequentially to the model in the order \texttt{Age}, \texttt{N}, \texttt{Stage}, \texttt{residual} and finally \texttt{CEhigh}. The p-value shows how significantly each variable contributes given the previously added variables.
<<>>=
## Cox-hazard model for PFS
anova(coxph(Surv(PFS,prog) ~ Age + N + Stage + residual + CEhigh))
## Cox-hazard model for OS
anova(coxph(Surv(OS,dead) ~ Age + N + Stage + residual + CEhigh))
@
\noindent
In summary, stratifying patients by the clonal expansion index is a significant predictor of both PFS and OS, independent of covariates including age and the number of samples per patient.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\newpage
\subsubsection{Survival analysis with continuous CE}
\label{contCE}
%The following code chunk reproduces Reviewer~1's analysis of our data, which we copied her for full transparency:
<>=
## TEMPORAL - nothing significant
summary(coxph(Surv(OS,dead)~TH))
summary(coxph(Surv(OS,dead)~ Age + N + Stage + residual + TH))
summary(coxph(Surv(PFS,prog)~TH))
summary(coxph(Surv(PFS,prog)~Age + N + Stage + residual + TH))
## CLONAL EXPANSION
summary(coxph(Surv(OS,dead)~CE))
summary(coxph(Surv(OS,dead)~ Age + N + Stage + residual + CE))
summary(coxph(Surv(PFS,prog)~CE))
summary(coxph(Surv(PFS,prog)~Age + N + Stage + residual + CE))
@
We additionally performed survival analysis with continuous CE values (instead of the CE-high and -low subgroups as above). Again, we use the \texttt{anova()} function to produce a sequential analysis of deviance table for the fitted model.
\paragraph{Linear model}
Directly using the continuous CE values in a Cox proportional hazard model assumes a linear relationship between CE and log-hazard.
However, for PFS we find no evidence for a linear relationship in our data:
<<>>=
## Univariate analysis
anova(coxph(Surv(PFS,prog)~CE))
## Multivariate analysis
anova(coxph(Surv(PFS,prog)~Age + N + Stage + residual + CE))
@
For OS we found more evidence for a linear relationship than for PFS and the univariate p-value is almost significant:
<<>>=
## Univariate analysis
anova(coxph(Surv(OS,dead)~CE))
summary(coxph(Surv(OS,dead)~CE))
@
However in a multivariate setting the linear relationship is no longer significant:
<<>>=
## Multivariate analysis
anova(coxph(Surv(OS,dead)~ Age + N + Stage + residual + CE))
@
For both PFS and OS, we plot CE versus relative hazard (using functions from the \texttt{rms} package).
<>=
d <- datadist(D)
options(datadist="d")
## PFS Cox model using rms functions
tmp <- cph(Surv(PFS,prog)~CE)
ylim <- list(y=list(limit=c(-1,2)))
p1 <- plot(Predict(tmp,CE=NA),main="PFS",scales=ylim)
## OS Cox model using rms functions
tmp <- cph(Surv(OS,dead)~CE)
p2 <- plot(Predict(tmp,CE=NA),main="OS",scales=ylim)
## plotting side by side
print(p1, position = c(0, 0, 0.5, 1), more = TRUE)
print(p2, position = c(0.5, 0, 1, 1))
@
The linear models are almost flat. For comparison with the nonlinear models in the next sections, note the small range of relative hazard on the y-axis.
\paragraph{Spline model}
However, we find evidence that the relationship between CE and log-hazard is non-linear. The following analysis uses cubic splines, a standard tool in survival analysis. To reduce overfitting we fixed the degrees of freedom to 3, which is smaller than the default setting (\texttt{df=4}):
<<>>=
## Univariate analysis
anova(coxph(Surv(PFS,prog)~pspline(CE,df=3)))
## Multivariate analysis
anova(coxph(Surv(PFS,prog)~Age + N + Stage + residual + pspline(CE,df=3)))
@
Also for OS we find evidence for a nonlinear relationship, but it is less pronounced:
<<>>=
## Univariate analysis
anova(coxph(Surv(OS,dead)~pspline(CE,df=3)))
## Multivariate analysis
anova(coxph(Surv(OS,dead)~Age + N + Stage + residual + pspline(CE,df=3)))
@
We plot the non-linear relationship between CE and the relative hazard we observed in our data (using functions from the \texttt{rms} package):
<>=
## PFS Cox model using rms functions
tmp <- cph(Surv(PFS,prog)~rcs(CE))
ylim <- list(y=list(limit=c(-4,12)))
p3 <- plot(Predict(tmp,CE=NA),main="PFS",scales=ylim)
## OS Cox model using rms functions
tmp <- cph(Surv(OS,dead)~rcs(CE))
p4 <- plot(Predict(tmp,CE=NA),main="OS",scales=ylim)
## plotting side by side
print(p3, position = c(0, 0, 0.5, 1), more = TRUE)
print(p4, position = c(0.5, 0, 1, 1))
@
As these plots show, the cutoff point we chose (\texttt{median(CE)}=\Sexpr{round(cutpointCE,2)}) lies in a region where the log relative hazard rapidly changes and discriminates low hazard from high hazard. This behaviour is most pronounced in the PFS plot on the left.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage
\subsection{Temporal heterogeneity and survival}
\label{medianTH}
We repeated the same analysis with a measure of temporal heterogeneity.
Again, we use the median value to split patients into low-TH (factor \texttt{THhigh=0}) and high-TH (factor \texttt{THhigh=1}) subgroups:
<>=
cutpointTH <- quantile(TH, 0.5, na.rm=TRUE)
D$THhigh <- 1 * (TH > cutpointTH)
detach(D); attach(D)
@
\noindent
We plot a histogram of clonal expansion values with median indicated as red line.
<>=
hist(TH,14,col="grey",main="Temporal heterogeneity")
abline(v=cutpointTH,col="red",lwd=2)
@
We test for differences in \textbf{progression free survival} (PFS) in the low TH and high TH subgroups:
<<>>=
survdiff(Surv(PFS,prog) ~ THhigh)
@
We test for differences in \textbf{overall survival} (OS) in the low-heterogeneity and high-heterogeneity subgroups:
<<>>=
survdiff(Surv(OS,dead) ~ THhigh)
@
Therefore, no significant differences were found for the overall survival and progression-free survival of patients stratified by temporal heterogeneity.
%%
%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Testing features of the evolutionary process}
To further understand features of tumor evolution underlying the survival differences, we used phylogenetic techniques:
\begin{enumerate}
\item We applied a test for \textbf{molecular clock}. This ascertains the right distance method (Fitch-Margoliash with or without clock assumption) was used to reconstruct the individual patient-specific trees.
\item We further tested each patient for the null hypothesis of a \textbf{star topology} as opposed to branched clonal evolution.
\end{enumerate}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Test for molecular clock}
<<>>=
clock.test <- function(p) {
Dtest = Ms[[p]]
suppressWarnings(if (is.na(Dtest)) return(NA))
## extract distances from diploid
idx=grep("diploid", rownames(Dtest))
p=Dtest[idx,-idx]
## distances from diploid to all others but diploid
## scale the residuals by the square root of their variance (their sd)
## the estimates p are normal distributed with mean t (time of divergence) and variance t.
res=(mean(p) - p) / sqrt(p)
res= t(res) %*% res
## the sum of squared residuals is therefore chisquare distributed with |p|-|samples| degrees
## of freedom. The "-|samples|" is because we optimize over the branch lengths
prob=pchisq(res, df=length(p))
return(1-prob) ## the p-value
}
@
Applying the \texttt{clock.test()} to all patients we get:
<<>>=
clock.pvals <- sapply(patients, clock.test)
round(clock.pvals,3)
@
And adjusting for multiple testing:
<<>>=
clock.pvals.adjust <- p.adjust(clock.pvals,"BH")
round(clock.pvals.adjust,3)
table(clock.pvals.adjust < 0.05)
@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Test for star topology}
We further tested the distance matrix for each patient against the null hypothesis of the distances being derived from an evolutionary star topology. All but one patients showed significant deviation from a star topology, i.e. they exhibited branched clonal evolution.
<<>>=
star.test=function(p) {
Dtest = Ms[[p]]
suppressWarnings(if (is.na(Dtest)) return(NA))
if (nrow(Dtest)<3) return(NA) ## no tree, no p-value
## extract lower triangle matrix (function defined in .Rnw)
p=lower.triangle(Dtest)
## set up |p| x |samples| matrix T of zeros and ones mapping branch lengths
## in a star topology setting to the pairwise distances (e.g. dist(a,b) = branch_a + branch_b)
count=1
T = matrix(data=0, nrow=length(p), ncol=nrow(Dtest))
for (i in 1:(ncol(T)-1)) {
for (j in (i+1):ncol(T)) {
T[count,c(i,j)]=1
count = count+1
}
}
## now find the optimal branch lengths b for our p
Tinv=solve(t(T)%*%T)
bopt=t(t(p) %*% T %*% Tinv)
## scale the residuals by the square root of their variance (their sd)
## the estimates p are normal distributed with mean t (time of divergence) and variance t.
res=(T %*% bopt - p) / sqrt(p)
res= t(res) %*% res
## the sum of squared residuals is therefore chisquare distributed with |p|-|samples| degrees
## of freedom. The "-|samples|" is because we optimize over the branch lengths
prob=pchisq(res, df=length(p)-ncol(Dtest))
return(1-prob) ## the p-value
}
@
Applying the \texttt{star.test()} to all patients we get:
<<>>=
star.pvals <- sapply(patients, star.test)
round(star.pvals,3)
@
And adjusting for multiple testing:
<<>>=
star.pvals.adjust <- p.adjust(star.pvals,"BH")
round(star.pvals.adjust,3)
@
Patients with only 3 samples will never be significantly different from a star topology as three taxa always form a star.
When investigating those patients \texttt{N > 3} samples we find:
<<>>=
table(star.pvals.adjust[D[names(star.pvals.adjust),"N"]>3] < 0.05)
@
%%
%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage
\section{Session Info}
<<>>=
sessionInfo()
@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% References
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\clearpage
%\bibliographystyle{../manuscript/myvancouver}
%\bibliography{../manuscript/cnv-distance}
\begin{thebibliography}{1}
\bibitem{Greaves2012}
Greaves M, Maley CC.
\newblock {C}lonal evolution in cancer.
\newblock \emph{Nature} 2012;\hspace{0pt}481:306--13.
\bibitem{Harrington1982}
Harrington DP, Fleming TR.
\newblock {A} class of rank test procedures for censored survival data.
\newblock \emph{Biometrika} 1982;\hspace{0pt}69:553--566.
\end{thebibliography}
<>=
detach(D)
@
\end{document}