Bruvo.distance <- function(genotype1, genotype2, maxl=9, usatnt=2, missing=-9) { require(combinat) # for permn if(identical(genotype1, genotype2)) {dist <- 0} else { if((length(genotype1)>maxl & length(genotype2)>maxl)| genotype1[1]==missing|genotype2[1]==missing){ dist <- NA} else { if(length(genotype1) >= length(genotype2)) { genotypeL <- genotype1/usatnt; genotypeS <- genotype2/usatnt} else { genotypeL <- genotype2/usatnt; genotypeS <- genotype1/usatnt} # if genotypes are identical, just return a distance of zero without doing the rest # of the calculation # if genotypes are both longer than 9, skip this calculation because it will take hours # whichever genotype has more alleles, make this genotypeL (long) and the other # genotypeS (short) # convert alleles into repeat counts by dividing by usatnt kl <- length(genotypeL) # sets the ploidy level for this genotype comparison ks <- length(genotypeS) # number of alleles in the shorter genotype allele.distances <- array(0 , c(kl,ks)) # Create an empty matrix to contain the raw distances between alleles for(n in 1:kl) { for(m in 1:ks) {allele.distances[n,m] <- genotypeL[n] - genotypeS[m]}} # fills the array with the differences in allele repeat count geometric.distances <- array(1 - 2^-abs(allele.distances) , c(kl,ks)) # geometric transformation based on mutation probabilities #Next, find the minimum distance sum among all permutations column <- 1:ks # an index of all columns (genotypeS alleles) row <- 1:kl # an index of all rows (genotypeL alleles) combinations <- combn(row, ks, FUN = NULL, simplify=FALSE) # all combinations of alleles in genotypeL that can be matched to non-virtual # alleles in genotypeS permutations <- permn(ks) # all possible orders that alleles within these combinations can go in mindist <- Inf # this variable will store the minimum sum encountered so far. for(i in 1:length(combinations)) { # the loop to go through every possible sum of compatible allele comparisons rowcomb <- combinations[[i]] # choose one combination of rows for this round for(l in 1:length(permutations)){ # go through all orders of this combinations of rows sum <- 0 # this is si, the sum of allele comparisons for(j in 1:ks){ sum <- sum + geometric.distances[rowcomb[permutations[[l]][j]],column[j]]} # the loop to calculate the sum for this permutation if(sum < mindist) {mindist <- sum} # is this the minimum sum found so far? }} dist <- (mindist+kl-ks)/kl # add 1 for each infinite virtual allele, then divide by the ploidy }} dist } read.GeneMapper<-function(infiles, missing=-9){ samples<-c() loci<-c() locusdata<-list() length(locusdata)<-length(infiles) for(i in 1:length(infiles)){ locusdata[[i]]<-read.table(infiles[i],sep="\t",header=TRUE, as.is=c("Sample.Name","Marker")) samples<-c(samples,locusdata[[i]][["Sample.Name"]]) loci<-c(loci,locusdata[[i]][["Marker"]]) } samples<-unique(samples) loci<-unique(loci) genotypedata<-array(list(missing),c(length(samples), length(loci)), list(samples,loci)) for(m in 1:length(locusdata)){ alleleindex<-grep("Allele",names(locusdata[[m]]),value=FALSE) for(j in 1:length(locusdata[[m]][["Sample.Name"]])){ untrimmedgenotype<-locusdata[[m]][j,alleleindex] #Gives an error if there are loci or samples not in the arguments genotypedata[[locusdata[[m]][["Sample.Name"]][j], locusdata[[m]][["Marker"]][j]]]<- untrimmedgenotype[!is.na(untrimmedgenotype)] } } genotypedata } distance.matrix.1locus<-function(gendata, distmetric=Bruvo.distance, progress=TRUE, ...){ count<-length(gendata) # how many genotypes there are distances<-matrix(nrow=count, ncol=count, dimnames=list(names(gendata),names(gendata))) for(m in 1:count){for(n in m:count){ thisdistance<-distmetric(gendata[[m]],gendata[[n]], ...) distances[m,n]<-thisdistance distances[n,m]<-thisdistance if(progress){print(c(names(gendata)[[m]],names(gendata)[[n]]))} }} distances } mean.distance.matrix <- function(gendata, samples=dimnames(gendata)[[1]], loci=dimnames(gendata)[[2]], all.distances=FALSE, usatnts=NULL, ...){ # create an array containing all distances by locus and sample loci.matrices<-array(dim=c(length(loci),length(samples),length(samples)), dimnames=list(loci,samples,samples)) for(i in loci){ if(is.null(usatnts)){ loci.matrices[i,,]<-distance.matrix.1locus(gendata[samples,i],...) } else { loci.matrices[i,,]<-distance.matrix.1locus(gendata[samples,i], usatnt=usatnts[i],...) } } # calculate the mean matrix across all loci mean.matrix<-matrix(nrow=length(samples),ncol=length(samples), dimnames=list(samples,samples)) for(j in samples){ for(k in samples){ mean.matrix[j,k]<-mean(loci.matrices[,j,k][!is.na(loci.matrices[,j,k])]) } } # return either the mean matrix and possibly the array as well if(all.distances){ list(loci.matrices, mean.matrix) } else { mean.matrix } } write.Structure <- function(gendata, ploidy, outfile, samples=dimnames(gendata)[[1]], loci=dimnames(gendata)[[2]], indploidies=rep(ploidy,times=length(samples)), extracols=NULL, missing=-9){ if(is.null(names(indploidies))) {names(indploidies)<-samples} structdata <- data.frame(rowlabel=c("missing",rep(samples, each=ploidy))) thiscol = 1 # which column we are working with # Fill in extra columns. Ignored if extracols=NULL. for(excol in dimnames(extracols)[[2]]){ thiscol <- thiscol+1 structdata[[thiscol]]<-c(0,rep(extracols[samples, excol], each=ploidy)) names(structdata)[thiscol]<-excol } # Fill in the genotypes for(locus in loci){ thiscol<-thiscol+1 alleles<-missing #set up the vector of alleles for(s in samples){ #If missing data must be inserted to reflect a lower ploidy level: if(indploidies[s] < ploidy){ if(length(gendata[[s,locus]]) == indploidies[s]){ #fully heterozygous genotype thesealleles <- c(gendata[[s,locus]], rep(missing, times=ploidy-indploidies[s])) } else { if(length(gendata[[s,locus]]) < indploidies[s]){ #duplicate the first allele to get to the right ploidy thesealleles <- c(gendata[[s,locus]], rep(gendata[[s,locus]][1], times=indploidies[s]- length(gendata[[s,locus]])), rep(missing, times=ploidy-indploidies[s])) } else { #randomly choose alleles to use if there are too many thesealleles <- c(sample(gendata [[s,locus]],indploidies[s],replace=FALSE), rep(missing, times=ploidy-indploidies[s])) } } #If the individual has equal or greater ploidy to that #being used in the file: } else { if(length(gendata[[s,locus]]) == ploidy){ #genotype fills available spaces thesealleles <- gendata[[s,locus]] } else { if(length(gendata[[s,locus]]) < ploidy){ #duplicate the first allele to get to the right ploidy thesealleles<-c(gendata[[s,locus]], rep(gendata[[s,locus]][1], times=ploidy-length(gendata[[s,locus]]))) } else { #randomly choose alleles to use if there are too many thesealleles<-sample(gendata[[s,locus]],ploidy,replace=FALSE) } } } alleles<-c(alleles,thesealleles) } structdata[[thiscol]]<-alleles names(structdata)[thiscol]<-locus } write.table(structdata, file=outfile, sep="\t", row.names=FALSE, quote=FALSE, col.names=TRUE) } estimate.ploidy <- function(gendata, samples=dimnames(gendata)[[1]], loci=dimnames(gendata)[[2]]){ # set up array to contain the maximum and average number of alleles ploidyinfo <- array(dim=c(length(samples),2), dimnames=list(samples, c("max.alleles","mean.alleles"))) # fill the array for(s in samples){ numalleles<-mapply(length,gendata[s,loci]) ploidyinfo[s,"max.alleles"] <- max(numalleles) ploidyinfo[s,"mean.alleles"] <- mean(numalleles) } ploidyinfo }