# Multivariate random effects functions # supporting paper: Aitken, C.G.G. & Lucy, D. (2003) Applied Statistics # all functions written for R - some adaptation may be necessary to get # them to run in S # # Function list: # # calculate.UC() # y1.y2.means() # calculate.h.opt() # minu() # MVRA.density() # MVRA.normal() # # All these functions are documented within the function code ######################################################################### # calculates U which is the measure of between fragment variability # for the whole population of items, and C which is the inter group # variability - also gets the means of the measurements for each group # # a critical restriction at the moment is that each item must have # the same number of measurements on it - eg from windows each window # must have the same number of fragments examined from it # ######################################################################### # # population - an m x n matrix of an itemindicator and replicated # measurements of isotopic or trace element concentrations # or some other multivariate suite of measurements # # variables - a vector which indicate which columns in population are # the the actual measurements - all these must be in # the same order as all the other functions # # itemindicator - a scaler indicating the column occupied by the variable # which indicates which item the row of measurements came # from ######################################################################### calculate.UC <- function(population, variables, item.indicator) { # define how many unique items are in the population items <- unique(population[,item.indicator]) no.items <- length(items) # pick out the number of measurables no.variables <- length(variables) # check to see if all the items have equal numbers of measurements measurements <- population[,item.indicator] measurements.per.item <- unique(table(measurements)) if(length(measurements.per.item) > 1){stop("Not all items have the same number of measurements")} # this check can go when the code is generalised to unequal numbers # of measurements in the population data # define the output matricies #Sw <- matrix(rep(0, no.variables^2), nrow=no.variables) Sw <- matrix(0, ncol=no.variables, nrow=no.variables) S <- Sw # get a matrix of measurement means for all measurements in the population all.means <- matrix(apply(population[,variables], 2, mean), nrow=1) # dummy within group means group.means <- matrix(0, nrow=no.items, ncol=no.variables) # for each unique item for(ctr in 1:no.items) { # define tempoary matrix ans <- matrix(rep(0, no.variables^2), nrow=no.variables) # which rows refer to repeated measurements on the same item repeats <- which(population[,item.indicator] == items[ctr]) # pick out the measurements for the item tempdat <- as.matrix(population[repeats, variables]) # get the variable means means <- matrix(apply(tempdat, 2, mean), nrow=1) # assign values to the group means for the population group.means[ctr,] <- means # for each set of measurements within an item for(ctr1 in 1:nrow(tempdat)) { # this is the (xij - xibar) %*% t(xij - xibar) bit # xij - xibar forms a component item.measurements.minus.means <- tempdat[ctr1,] - means ans <- ans + t(item.measurements.minus.means) %*% item.measurements.minus.means } # sum the within and between item means S <- S + t(means - all.means) %*% (means - all.means) # sum for all measurements Sw <- Sw + ans } # convert matrix Sw to matrix U U <- Sw / ((no.items * measurements.per.item) - no.items) # convert matrix S to matrix C C <- (S / (no.items - 1) ) - (U / measurements.per.item) return(U, C, group.means, all.means) } ######################################################################### # calculates the difference between the mean of the suspect fragments # and the mean of the recovered fragments for each variable # # REQUIRES # # control.data - those records from the control fragments # # control.variables - a vector of integers indicating which # columns in the data are the measurements # they must indicate the same order as # the population and suspect vectors # that is if the variables are Pb and As in # columns 3 and 4 in this frame and 6 and 5 in the # population frame then they must be listed in # respective orders # # recovered.data - those records from the suspect fragments # # recovered.variables - a vector of integers indicating which columns # in the data frame are the actual measurements # all the caveats about ordering apply to this # input # # RETURNS # # recovered.mean vector of means for the recovered measurements # # control.mean vector of means for the control measurements # # Nc - number of measurements from the crimescene (control) # # Nr - number of measurements from the suspect (recovered) ######################################################################### y1.y2.means <- function(control.data, control.variables, recovered.data, recovered.variables) { # pull out the actual measurements and put each of them in a # frame to stop R getting confused over vector / matrix recovered <- data.frame(recovered.data[,recovered.variables]) control <- data.frame(control.data[,control.variables]) # get vectors of the means of the variables recovered.mean <- apply(recovered, 2, mean) control.mean <- apply(control, 2, mean) # find out how many measurements have been taken for each Nc <- nrow(control) Nr <- nrow(recovered) return(recovered.mean, control.mean, Nc, Nr) } ########################################################################## # calculates h.opt from the number of groups and the number of variables # # REQUIRES # # n.groups - number of items from which replicate measurements have been # taken # # n.variables - number of measurements made on each of the items # # RETURNS # # an estimate of h.opt ########################################################################## calculate.h.opt <- function(n.groups, n.variables){return(((4/((2 * n.variables) + 1)) ^ (1 / (n.variables + 4))) * (n.groups ^ (-(1/(n.variables + 4)))))} ########################################################################## # calculates the difference between two numbers # intended for use as an apply() functionette # # x is a matrix from whose rows we wish to subtract y # # y is a vector ########################################################################## minu <- function(x, y){y - x} ########################################################################## # calculates the likelihood ratio for a mult-variate random effects # density model - further documentation is follows after the function # code # # This function could still do with being made a bit quicker # I have tried using apply() type formulations where appropriate # but they are slightly slower than the counted iterations - # I suspect the fact that they are dealing with matrix operations # has something to do with it # # we're using the prod(eigen(A)$values) form rather than det(A) # as it seems to be a little more reliable as some of the matricies # tend to singularity # # # REQUIRES # # control.mean - vector of variable means for the control fragment # see function y1.y2.means() # # recovered.mean - vector of variable means for the control fragment # see function y1.y2.means() # # Nc - number of replicates for the control fragment - see function y1.y2.means() # # Nr - number of replicates for the recovered fragment - see function y1.y2.means() # # variables - a vector of integers indicating which columns in the data # frames are the measurements # # U - the matrix U calculated from the function calculate.UC() # # C - the matrix C calculated from the function calculate.UC() # # group.means - a vector of means for measurements for each group # calculated by function calculate.UC() # # # # # RETURNS # # value - an estimate of the likelihood ratio ########################################################################## MVRA.density <- function(control.mean, recovered.mean, Nc, Nr, variables, U, C, group.means, h.opt) { n.variables <- length(variables) n.groups <- nrow(group.means) D.control <- U / Nc D.recovered <- U / Nr # a sort of way of testing to see whether the matricies U and C # are singular - a better way may be to look at condition numbers # this bit still in development if(! is.numeric(try(solve(U)))){value <- "NA"; return(value)} #if(! is.numeric(try(solve(C)))){value <- "NA"; return(value)} # component way of getting the inverses of the D matricies U.inv <- solve(U) inv.D.control <- U.inv * Nc inv.D.recovered <- U.inv * Nr control.minus.recovered <- control.mean - recovered.mean A <- inv.D.control + inv.D.recovered # reasonably clever way of calculating the inverse of A inv.A <- U / (Nc + Nr) #assign("A", A, .GlobalEnv) # useful bit of debug code y.star <- inv.A %*% ((inv.D.control %*% control.mean) + (inv.D.recovered %*% recovered.mean)) ########################################################################## top1 <- sqrt(prod(eigen(C)$values)) top2 <- n.groups * (h.opt ^ n.variables) inv.h.opt.squared.times.C <- solve((h.opt^2) * C) top3 <- 1 / sqrt(prod(eigen(A + inv.h.opt.squared.times.C)$values)) top4 <- exp(-0.5 * t(control.minus.recovered) %*% solve(D.control + D.recovered) %*% control.minus.recovered) matt1 <- rep(0, n.groups) # this invocation of minu makes a little difference to speed of execution y.star.minus.mean <- t(apply(group.means, 1, minu, y=y.star)) numerator.constant <- solve(inv.A + (h.opt^2) * C) for(ctr in 1:n.groups){matt1[ctr] <- exp(- 0.5 * t(y.star.minus.mean[ctr,]) %*% numerator.constant %*% y.star.minus.mean[ctr,])} top5 <- sum(matt1) numerator <- prod(top1, top2, top3, top4, top5) ########################################################################## ########################################################################## bot1 <- 1 / sqrt(prod(eigen(inv.D.control + inv.h.opt.squared.times.C)$values)) matt2 <- rep(0, n.groups) control.konstant <- solve(D.control + ((h.opt ^ 2) * C)) for(ctr in 1:n.groups){matt2[ctr] <- exp(-0.5 * t(control.mean - group.means[ctr,]) %*% control.konstant %*% (control.mean - group.means[ctr,]))} ######################################################################### ## the matrix parts can equally ## well be done as the following apply() type ## of formulation - unfortunately despite its cleverness ## it is slower than the iterative approach # ma2 <- t(apply(group.means, 1, minu, y=control.mean)) # konstant <- solve(D.control + ((h.opt ^ 2) * C)) # ma3 <- apply(ma2, 1, mulp, y=konstant) # ma3 <- exp(-0.5 * ma3) ## ma3 is now equal to matt2 ######################################################################### bot2 <- sum(matt2) bot3 <- 1 / sqrt(prod(eigen(inv.D.recovered + inv.h.opt.squared.times.C)$values)) matt3 <- rep(0, n.groups) recovered.konstant <- solve(D.recovered + ((h.opt ^ 2) * C)) for(ctr in 1:n.groups){matt3[ctr] <- exp(-0.5 * t(recovered.mean - group.means[ctr,]) %*% recovered.konstant %*% (recovered.mean - group.means[ctr,]))} bot4 <- sum(matt3) denomonator <- prod(bot1, bot2, bot3, bot4) value <- numerator/denomonator ########################################################################## return(value) } ################################################################################################################### ######## DOCUMENTATION FOR THIS FUNCTION - LATEX SOURCE COPY AND PASTE ############################################ ######## DON'T FORGET TO REMOVE THE R COMMENT INDICATORS ############################################ ################################################################################################################### #\documentclass[12pt, a4paper]{article} #\pagestyle{empty} #\usepackage{portland} #\setlength{\parindent}{0in} #% convert from .dvi to .ps by #%dvips -t landscape formula.dvi -o formula.ps #% then print the .ps #\begin{document} #\landscape #{\Large {\bf MVRA formula}}\\ #\large #If there is a class of $m$ objects or groups which have a range of $p$ characteristics $x$ each of which can be measured $n$ times on a continuous scale within the groups, then denote the index of $m$ as $i$ so $i = \{1,2, \ldots m\}$, that of $n$ as $j$, so $j = \{1,2, \ldots n\}$, so there are $mn$ sets of measurements, and $mnp$ measurements.\\ #Suppose there are two sets, one of $n_{c}$, one of $n_{r}$ measurements on the $p$ characteristics and a comparison between the two sets is required. Let $\bar{y}_{1}$ be a vector of means of the $n_{c}$ measurements from the first object and $\bar{y}_{2}$ be a vector of means of the $n_{r}$ measurements from the second object. Let the first set be denoted as the {\bf control} set, and could be a set recovered at the scene of a crime. Let the second set be called the {\bf recovered} set; this could be a set found associated with a suspect. The two objects are indexed by $l$ where $l = \{1,2\}$.\\ #Each member of the class of objects will have a vector of $p$ means from the $n$ measurements, taken from that member. Denote any of these as $\bar{x}_{i}$. Similary denote each vector of $p$ measurements as $x_{ij}$. so that $\bar{x}_{i} = 1/n \sum^{n}_{j=1}x_{ij}$.\\ #The within group varience can be estimated as: #\begin{displaymath} #\hat{U} = \frac{S_{w}}{nm-m} \hspace{10mm} \mathrm{where}\hspace{10mm} S_{w}=\displaystyle \sum_{i=1}^{m} \sum_{j=1}^{n}(x_{ij} - \bar{x}_{i})(x_{ij} - \bar{x}_{i})^{T} #\end{displaymath} #If there are $n_{c}$ replicates in the control sample and $n_{r}$ replicates in the recovered sample then: #\begin{displaymath} #D_{1}=\frac{\hat{U}}{n_{c}} \hspace{4mm}\mathrm{and}\hspace{4mm} D_{2}=\frac{\hat{U}}{n_{r}} #\end{displaymath}\\ #The between groups variance can be estimated as: #\begin{displaymath} #\hat{C} = \frac{S^{*}}{m-1} \hspace{10mm} \mathrm{where}\hspace{10mm} S^{*}=\displaystyle \sum_{i=1}^{m} (\bar{x}_{i} - \bar{x})(\bar{x}_{i} - \bar{x})^{T} #\end{displaymath} #with\footnote{use the identity: #\begin{displaymath} #A^{-1} = \left[ \left( \frac{\hat{U}}{n_{c}} \right)^{-1} + \left( \frac{\hat{U}}{n_{c}} \right]^{-1} \right]^{-1} = \frac{\hat{U}}{n_{c} + n_{r}} #\end{displaymath} #}: #\begin{displaymath} #A = \left( \frac{\hat{U}}{n_{c}} \right)^{-1} + \left( \frac{\hat{U}}{n_{r}} \right) ^{-1} #\end{displaymath} #and: #\begin{displaymath} #y^{*} = (D_{1}^{-1} + D_{2}^{-1})(D_{1}^{-1} \bar{y}_{1} + D_{2}^{-1} \bar{y}_{2}) #\end{displaymath}\\ #The window smoothing parameter is estimated as $\hat{h}$ where: #\begin{displaymath} #\hat{h} = h_{\mathrm{opt}} = \left( \frac{4}{2p+1} \right)^{\frac{1}{p+4}} \frac{1}{m^{p+4}} #\end{displaymath}\\ #Then the likelihood ratio $\mathbf{V}$ is:\\ #\begin{displaymath} #\mathbf{V}=\frac #{ \displaystyle #\sqrt{\mid \hat{C} \mid} \;\; #\frac{1}{(m\hat{h}^p)} \;\; #\frac{1}{\sqrt{\mid A + (h^2 \hat{C})^{-1} \mid}} \;\; #e^{-\frac{1}{2}(\bar{y}_1 - \bar{y}_2)^T (D_1 + D_2)^{-1}(\bar{y}_1 - \bar{y}_2)} \;\; #\sum_{i=1}^{m} #e^{-\frac{1}{2}(y^* - \bar{x}_i)^T (A^{-1} + (\hat{h}^2 \hat{C}))^{-1} ( y^* - \bar{x}_i)} #} #{ \displaystyle #\prod_{l=1}^{2} \left[ #\frac{1}{\sqrt{\mid D_l^{-1} + (\hat{h}^2 \hat{C})^{-1}\mid}} \;\; #\sum_{i=1}^{m} #e^{-\frac{1}{2}(\bar{y}_l - \bar{x}_i)^T (D_l + \hat{h}^2 \hat{C})^{-1} ( \bar{y}_l - \bar{x}_i)} #\right] #} #\end{displaymath} #\begin{table} #\begin{center} #\large #\begin{tabular}{clcl} \hline #{\em Variable} &{\em Form} &{\em Type} &{\em Description} \\ \hline #$\hat{C}$ &$p \times p$ square symetric matrix &real &between group variance\\ #$m$ &scalar &integer $>1$ &number of groups\\ #$h$ &scalar &real $>0$ &smoothing parameter\\ #$p$ &scalar &integer $>1$ &number of variables\\ #$A$ &$p\times p$ square symetric matrix &real &sum of inverses of within groups variances\\ # & & &of means of control and recovered data\\ #$\bar{y}_{1}$ &vector of length $p$ &real &mean of control measurements\\ #$\bar{y}_{2}$ &vector of length $p$ &real &mean of recovered measurements\\ #$D_{1}$ &$p \times p$ square symetric matrix &real &estimate of within groups variance\\ #$D_{2}$ &$p \times p$ square symetric matrix &real &estimate of within groups variance\\ #$y^{*}$ &vector of length $p$ &real &weighted mean of control and\\ & & &recovered measurements\\ #$\bar{x}_{i}$ &vector of length $p$ &real &means of measurements for $m$ groups in dataset\\ #$\hat{U}$ &$p\times p$ square symetric matrix &real &estimate of within groups varience\\ #$n_{c}$ &scalar &integer $>1$ &number of replicates for control item\\ #$n_{r}$ &scalar &integer $>1$ &number of replicates for recovered item\\ #$S^{*}$ &$p \times p$ square symetric matrix &real &between group sum of squares\\ #$x_{ij}$ &vector of length $p$ &real &$p$ measurements on $j^{\mathrm{th}}$ member of $i^{\mathrm{th}}$ group\\ #$S_{w}$ &$p \times p$ square symetric matrix &real &within group sum of squares\\ \hline #\end{tabular} #\end{center} #\end{table} #\begin{table} #\begin{center} #\large #\begin{tabular}{ccc} #\hline #{\em Formula component} & &{\em Code component}\\ \hline #&\\ #$\sqrt{\mid \hat{C} \mid}$ &= &top1\\ #&\\ #$\displaystyle \frac{1}{(m\hat{h}^p)}$ &= &top2\\ #&\\ #$\displaystyle \frac{1}{\sqrt{\mid A + (h^2 \hat{C})^{-1} \mid}}$ &= &top3\\ #&\\ #$\displaystyle e^{-\frac{1}{2}(\bar{y}_1 - \bar{y}_2)^T (D_1 + D_2)^{-1}(\bar{y}_1 - \bar{y}_2)}$ &= &top4\\ #&\\ #$\displaystyle \sum_{i=1}^{m} e^{-\frac{1}{2}(y^* - \bar{x}_i)^T (A^{-1} + (\hat{h}^2 \hat{C}))^{-1} ( y^* - \bar{x}_i)}$ &= &top5\\ #$\displaystyle \frac{1}{\sqrt{\mid D_l^{-1} + (\hat{h}^2 \hat{C})^{-1}\mid}}$ &= &bot1 and bot3\\ #&\\ #$\displaystyle \sum_{i=1}^{m} e^{-\frac{1}{2}(\bar{y}_l - \bar{x}_i)^T (D_l + \hat{h}^2 \hat{C})^{-1} ( \bar{y}_l - \bar{x}_i)}$ &= &bot2 and bot4\\ &\\ \hline #\end{tabular} #\end{center} #\end{table} #\end{document} ################################################################################################################### ######## END OF DOCUMENTATION ##################################################################################### ################################################################################################################### ######################################################################### # calculates the likelihood ratio for the suspect and recovered # fragments comming from the same source # # U - is the inter-item varibility and is an nxn matrix # where n is an integer of the number of measurements # # S - is the between group varibility and is an nxn matrix # where n is an integer of the number of measurements # # recovered.minus.suspect - is a column matrix of n differences in # means between recovered and suspect items # where n is an integer of the number # of measurements # # N.recovered - is an integer of the number of replicate measurements # for the recovered item # # N.suspect - is an integer of the number of replicate measurements # for the suspect item ######################################################################### MVRA.normal <- function(U, C, recovered.minus.suspect, N.recovered, N.suspect) { D1 <- U / N.recovered D2 <- U / N.suspect F <- solve(D1) + solve(D2) transpose.r.minus.s <- t(recovered.minus.suspect) numerator1 <- (1 / sqrt(det(D1))) * (1 / sqrt(det(D2))) * (1 / sqrt(det(C))) * (1 / sqrt(prod(eigen(F + solve(C))$values))) * (sqrt(det(solve(F) + C))) numerator2 <- exp(-0.5 * (transpose.r.minus.s %*% solve(D1 + D2) %*% recovered.minus.suspect)) denomonator <- (1 / sqrt(det(D1 + D2 + (2 * C)))) * exp(- 0.5 * (transpose.r.minus.s %*% solve(D1 + D2 + (2 * C)) %*% recovered.minus.suspect)) LR <- (numerator1 * numerator2) / denomonator return(LR) } # try using prod(eigen(F + ginv(C))$values) on all the determinate bits