Bruvo.distance <- function(genotype1, genotype2, maxl=9, usatnt=2, missing=-9) { require(combinat) # for permn if(identical(genotype1, genotype2)) { dist <- 0 } else { if((length(genotype1)>maxl & length(genotype2)>maxl)| genotype1[1]==missing|genotype2[1]==missing){ dist <- NA } else { if(length(genotype1) >= length(genotype2)) { genotypeL <- genotype1/usatnt; genotypeS <- genotype2/usatnt } else { genotypeL <- genotype2/usatnt; genotypeS <- genotype1/usatnt } # if genotypes are identical, just return a distance of zero without doing the rest # of the calculation # if genotypes are both longer than 9, skip this calculation because it will take hours # whichever genotype has more alleles, make this genotypeL (long) and the other # genotypeS (short) # convert alleles into repeat counts by dividing by usatnt kl <- length(genotypeL) # sets the ploidy level for this genotype comparison ks <- length(genotypeS) # number of alleles in the shorter genotype allele.distances <- array(0 , c(kl,ks)) # Create an empty matrix to contain the raw distances between alleles for(n in 1:kl) { for(m in 1:ks) {allele.distances[n,m] <- genotypeL[n] - genotypeS[m]}} # fills the array with the differences in allele repeat count geometric.distances <- array(1 - 2^-abs(allele.distances) , c(kl,ks)) # geometric transformation based on mutation probabilities #Next, find the minimum distance sum among all permutations column <- 1:ks # an index of all columns (genotypeS alleles) row <- 1:kl # an index of all rows (genotypeL alleles) combinations <- combn(row, ks, FUN = NULL, simplify=FALSE) # all combinations of alleles in genotypeL that can be matched to non-virtual # alleles in genotypeS permutations <- permn(ks) # all possible orders that alleles within these combinations can go in mindist <- Inf # this variable will store the minimum sum encountered so far. for(i in 1:length(combinations)) { # the loop to go through every possible sum of compatible allele comparisons rowcomb <- combinations[[i]] # choose one combination of rows for this round for(l in 1:length(permutations)){ # go through all orders of this combinations of rows sum <- 0 # this is si, the sum of allele comparisons # the loop to calculate the sum for this permutation for(j in 1:ks){ sum <- sum + geometric.distances[rowcomb[permutations[[l]][j]],column[j]] } if(sum < mindist) {mindist <- sum} # is this the minimum sum found so far? } } dist <- (mindist+kl-ks)/kl # add 1 for each infinite virtual allele, then divide by the ploidy }} return(dist) } read.GeneMapper<-function(infiles, missing=-9){ samples<-c() loci<-c() locusdata<-list() length(locusdata)<-length(infiles) for(i in 1:length(infiles)){ locusdata[[i]]<-read.table(infiles[i],sep="\t",header=TRUE, as.is=c("Sample.Name","Marker")) samples<-c(samples,locusdata[[i]][["Sample.Name"]]) loci<-c(loci,locusdata[[i]][["Marker"]]) } samples<-unique(samples) loci<-unique(loci) genotypedata<-array(list(missing),c(length(samples), length(loci)), list(samples,loci)) for(m in 1:length(locusdata)){ alleleindex<-grep("Allele",names(locusdata[[m]]),value=FALSE) for(j in 1:length(locusdata[[m]][["Sample.Name"]])){ untrimmedgenotype<-locusdata[[m]][j,alleleindex] #Gives an error if there are loci or samples not in the arguments genotypedata[[locusdata[[m]][["Sample.Name"]][j], locusdata[[m]][["Marker"]][j]]]<- untrimmedgenotype[!is.na(untrimmedgenotype)] } } return(genotypedata) } distance.matrix.1locus<-function(gendata, distmetric=Bruvo.distance, progress=TRUE, ...){ count<-length(gendata) # how many genotypes there are distances<-matrix(nrow=count, ncol=count, dimnames=list(names(gendata),names(gendata))) for(m in 1:count){for(n in m:count){ thisdistance<-distmetric(gendata[[m]],gendata[[n]], ...) distances[m,n]<-thisdistance distances[n,m]<-thisdistance if(progress){print(c(names(gendata)[[m]],names(gendata)[[n]]))} }} return(distances) } meandistance.matrix <- function(gendata, samples=dimnames(gendata)[[1]], loci=dimnames(gendata)[[2]], all.distances=FALSE, usatnts=NULL, ...){ # create an array containing all distances by locus and sample loci.matrices<-array(dim=c(length(loci),length(samples),length(samples)), dimnames=list(loci,samples,samples)) for(i in loci){ if(is.null(usatnts)){ loci.matrices[i,,]<-distance.matrix.1locus(gendata[samples,i],...) } else { loci.matrices[i,,]<-distance.matrix.1locus(gendata[samples,i], usatnt=usatnts[i],...) } } # calculate the mean matrix across all loci mean.matrix<-matrix(nrow=length(samples),ncol=length(samples), dimnames=list(samples,samples)) for(j in samples){ for(k in samples){ mean.matrix[j,k]<-mean(loci.matrices[,j,k][!is.na(loci.matrices[,j,k])]) } } # return either the mean matrix and possibly the array as well if(all.distances){ return(list(DistByLoc=loci.matrices, MeanMatrix=mean.matrix)) } else { return(mean.matrix) } } write.Structure <- function(gendata, ploidy, file="", samples=dimnames(gendata)[[1]], loci=dimnames(gendata)[[2]], indploidies=rep(ploidy,times=length(samples)), extracols=NULL, missing=-9){ if(is.null(names(indploidies))) {names(indploidies)<-samples} structdata <- data.frame(rowlabel=c("missing",rep(samples, each=ploidy))) thiscol = 1 # which column we are working with # Fill in extra columns. Ignored if extracols=NULL. for(excol in dimnames(extracols)[[2]]){ thiscol <- thiscol+1 structdata[[thiscol]]<-c(0,rep(extracols[samples, excol], each=ploidy)) names(structdata)[thiscol]<-excol } # Fill in the genotypes for(locus in loci){ thiscol<-thiscol+1 alleles<-missing #set up the vector of alleles for(s in samples){ #If missing data must be inserted to reflect a lower ploidy level: if(indploidies[s] < ploidy){ if(length(gendata[[s,locus]]) == indploidies[s]){ #fully heterozygous genotype thesealleles <- c(gendata[[s,locus]], rep(missing, times=ploidy-indploidies[s])) } else { if(length(gendata[[s,locus]]) < indploidies[s]){ #duplicate the first allele to get to the right ploidy thesealleles <- c(gendata[[s,locus]], rep(gendata[[s,locus]][1], times=indploidies[s]- length(gendata[[s,locus]])), rep(missing, times=ploidy-indploidies[s])) } else { #randomly choose alleles to use if there are too many thesealleles <- c(sample(gendata [[s,locus]],indploidies[s],replace=FALSE), rep(missing, times=ploidy-indploidies[s])) } } #If the individual has equal or greater ploidy to that #being used in the file: } else { if(length(gendata[[s,locus]]) == ploidy){ #genotype fills available spaces thesealleles <- gendata[[s,locus]] } else { if(length(gendata[[s,locus]]) < ploidy){ #duplicate the first allele to get to the right ploidy thesealleles<-c(gendata[[s,locus]], rep(gendata[[s,locus]][1], times=ploidy-length(gendata[[s,locus]]))) } else { #randomly choose alleles to use if there are too many thesealleles<-sample(gendata[[s,locus]],ploidy,replace=FALSE) } } } alleles<-c(alleles,thesealleles) } structdata[[thiscol]]<-alleles names(structdata)[thiscol]<-locus } write.table(structdata, file=file, sep="\t", row.names=FALSE, quote=FALSE, col.names=TRUE) } estimate.ploidy <- function(gendata, samples=dimnames(gendata)[[1]], loci=dimnames(gendata)[[2]]){ # set up array to contain the maximum and average number of alleles ploidyinfo <- array(dim=c(length(samples),2), dimnames=list(samples, c("max.alleles","mean.alleles"))) # fill the array for(s in samples){ numalleles<-mapply(length,gendata[s,loci]) ploidyinfo[s,"max.alleles"] <- max(numalleles) ploidyinfo[s,"mean.alleles"] <- mean(numalleles) } return(ploidyinfo) } read.Tetrasat <- function(infile,missing=-9){ #read the file into a character vector, containing all the lines rawdata<-readLines(infile) #find which lines delimit populations popindex<-grep("pop",rawdata,ignore.case=TRUE,value=FALSE) #get a character vector of loci, whether they were stored in one or #several lines loci<-rawdata[2:(popindex[1]-1)] if(length(loci) == 1){loci<-strsplit(loci,",")[[1]]} #find which lines contain genotype data samindex<-c(popindex[1]:length(rawdata)) samindex<-samindex[!samindex %in% popindex] #make a two-dimensional list to hold the genotypes gendata<-array(list(missing),dim=c(length(samindex),length(loci)), dimnames=list(c(1:length(samindex)),loci)) #make a vector containing population data popdata<-c() length(popdata)<-length(samindex) #Extract the data out of the lines for(i in 1:length(samindex)){ #Find the largest value in popindex that is smaller than samindex[i] #Use the position of that value in popindex for the pop id popdata[i]<-match(max(popindex[popindex < samindex[i]]),popindex) #extract sample names and genotypes samname<-gsub(" ","",substring(rawdata[samindex[i]],1,20),fixed=TRUE) names(popdata)[i]<-samname dimnames(gendata)[[1]][i]<-samname thesegenotypes<-substring(rawdata[samindex[i]], seq(length=length(loci),from=21,by=9), seq(length=length(loci),from=28,by=9)) for(j in 1:length(loci)){ thesealleles<-gsub(" ","",thesegenotypes[j],fixed=TRUE) if(nchar(thesealleles) != 0){ thesealleles<-substring(thesealleles, seq(length=nchar(thesealleles)/2,from=1,by=2), seq(length=nchar(thesealleles)/2,from=2,by=2)) thesealleles<-unique(as.integer(thesealleles)) } else {thesealleles<-missing} gendata[[i,j]]<-thesealleles } } #return genotype and pop data return(list(PopData=popdata,Genotypes=gendata)) } read.ATetra<-function(infile){ #read the file into a list of vectors, one for each line rawdata<-strsplit(readLines(infile),",") #make an index of which lines contain locus and sample info #also make an index of all loci locindex<-c() samindex<-c() for(i in 1:length(rawdata)){ if(rawdata[[i]][1] == "LOC"){ locindex<-c(locindex,i) names(locindex)[length(locindex)]<-rawdata[[i]][3] } if(rawdata[[i]][1] == "IND"){ samindex<-c(samindex,i) } } #make a vector of all loci loci<-names(locindex) #make a vector of all samples, and a vector of which samples go in which pops samples<-c() popdata<-c() samindex1loc<-samindex[samindex < locindex[2]] for(j in 1:length(samindex1loc)){ samples[j]<-rawdata[[samindex1loc[j]]][5] popdata[j]<-as.integer(rawdata[[samindex1loc[j]]][3]) } names(popdata)<-samples #set up the array to contain genotypes gendata<-array(list(missing),dim=c(length(samples),length(loci)), dimnames=list(samples,loci)) #fill the array of genotypes for(m in samindex){ thesealleles<-rawdata[[m]][6:9] thesealleles<-as.integer(thesealleles[thesealleles !=""]) thesealleles<-thesealleles[!is.na(thesealleles)] gendata[[as.integer(rawdata[[m]][4]),as.integer(rawdata[[m]][2])]]<-thesealleles } #return population data and genotypes return(list(PopData=popdata,Genotypes=gendata)) } dominant.to.codominant<-function(domdata,colinfo=NULL,samples=dimnames(domdata)[[1]], missing=-9,allelepresent=1,split="."){ #get locus and allele information if(is.null(colinfo)){ rawcol<-strsplit(dimnames(domdata)[[2]],split,fixed=TRUE) loc<-c() alleles<-c() for(i in 1:length(rawcol)){ loc[i]<-rawcol[[i]][1] alleles[i]<-as.integer(rawcol[[i]][2]) } colinfo<-data.frame(Loci=loc, Alleles=alleles) } #extract a list of loci loci<-unique(colinfo[[1]]) #set up the genotype list gendata<-array(list(missing),dim=c(length(samples),length(loci)), dimnames=list(samples,loci)) #fill the genotype vectors for(n in 1:dim(domdata)[2]){ for(m in samples){ if(domdata[m,n] == allelepresent){ gendata[[m,colinfo[n,1]]]<-c(gendata[[m,colinfo[n,1]]],colinfo[n,2]) gendata[[m,colinfo[n,1]]]<- gendata[[m,colinfo[n,1]]][gendata[[m,colinfo[n,1]]] != missing] } } } #return codominant genotypes return(gendata) } codominant.to.dominant<-function(gendata, makecolinfo=FALSE, allelepresent=1, alleleabsent=0, missingin=-9, missingout=-9, loci=dimnames(gendata)[[2]], samples=dimnames(gendata)[[1]]){ # Find all unique alleles for each locus locvector<-c() allelevector<-c() for(l in loci){ thesealleles<-c() for(s in samples){ thesealleles<-c(thesealleles,gendata[[s,l]]) } thesealleles<-unique(thesealleles) thesealleles<-thesealleles[thesealleles != missingin] thesealleles<-sort(thesealleles) locvector<-c(locvector,rep(l,length(thesealleles))) allelevector<-c(allelevector,thesealleles) } # Build data frame of locus and allele information. colinfo<-data.frame(Loci=locvector,Alleles=allelevector) # Set up matrix of dominant data domdata<-matrix(nrow=length(samples), ncol=dim(colinfo)[1], dimnames=list(samples,1:dim(colinfo)[1])) # Search for alleles and fill matrix with presence and absence symbols for(m in 1:dim(colinfo)[1]){ for(s in samples){ if(gendata[[s,colinfo[m,1]]][1] == missingin){ domdata[s,m]<-missingout } else { if(colinfo[[m,2]] %in% gendata[[s,colinfo[m,1]]]){ domdata[s,m]<-allelepresent } else { domdata[s,m]<-alleleabsent } } } # Create the locus.allele name for the column dimnames(domdata)[[2]][m]<-paste(colinfo[m,1],colinfo[m,2], sep=".") } if(makecolinfo) return(list(Domdata=domdata, Colinfo=colinfo)) else return(domdata) } read.GenoDive <- function(infile,missing=-9){ rawdata<-readLines(infile) #information about number of samples, number of loci, etc. is in second line datainfo<-as.integer(strsplit(rawdata[[2]],"\t")[[1]]) numsam<-datainfo[1] # number of individuals numpop<-datainfo[2] # number of populations numloc<-datainfo[3] # number of loci digits<-datainfo[5] # number of digits used to code each allele # get the data header with column names, including loci colheader<-strsplit(rawdata[[3+numpop]],"\t")[[1]] lastloc<-length(colheader) # index of the last locus column firstloc<-lastloc-numloc+1 # index of the first locus column (3 or 4) loci<-colheader[firstloc:lastloc] # get the locus names # set up the list to contain the genotype data and the vector to contain the # population info gendata<-array(list(missing),dim=c(numsam,numloc),dimnames=list(1:numsam,loci)) popinfo<-rep(0,times=numsam) names(popinfo)<-1:numsam # fill the list and the vector! for(s in 1:numsam){ # get data from the row for this sample sampledata<-strsplit(rawdata[[3+numpop+s]],"\t")[[1]] # extract name and population popinfo[s]<-as.integer(sampledata[1]) names(popinfo)[s]<-sampledata[2] dimnames(gendata)[[1]][s]<-sampledata[2] # extract alleles for(L in 1:numloc){ rawalleles<-sampledata[(firstloc:lastloc)[L]] while(nchar(rawalleles)%%digits !=0) { # If leading zeros were removed rawalleles<-paste("0",rawalleles,sep="") } # convert the character string to an integer vector myfirst<-seq(length=nchar(rawalleles)/digits,from=1,by=digits) mylast<-seq(length=nchar(rawalleles)/digits,from=digits,by=digits) thesealleles<-as.integer(substring(rawalleles,myfirst,mylast)) # get rid of duplicate alleles thesealleles<-unique(thesealleles) # insert the missing data symbol as appropriate if(length(thesealleles)==1 && thesealleles==0){thesealleles<-missing} # get rid of "missing alleles" if the whole genotype is not missing thesealleles<-thesealleles[thesealleles !=0] # write the allele vector to the list gendata[[s,L]]<-thesealleles } } # return the vector and list return(list(PopData=popinfo,Genotypes=gendata)) } write.GenoDive<-function(gendata, popnames="onebigpop", commentline="file description goes here", digits=2, file="", samples=dimnames(gendata)[[1]], loci=dimnames(gendata)[[2]], popinfo=rep(1,times=length(samples)), usatnts=rep(2,times=length(loci)), missing=-9){ # fill in names if popinfo and usatnts are default if(is.null(names(popinfo))){names(popinfo)<-samples} if(is.null(names(usatnts))){names(usatnts)<-loci} # get some info for the second line of the file numsam<-length(samples) numloc<-length(loci) numpop<-length(popnames) if(numpop != length(unique(popinfo[samples]))){ cat("Warning: number of populations in popnames and popinfo is different.\n") } maxploidy<-max(estimate.ploidy(gendata[samples,loci])[,"max.alleles"]) # start a character vector to contain all the lines of the file lines<-c(commentline,paste(numsam,numpop,numloc,maxploidy,digits, sep="\t")) lines[3:(numpop+2)]<-popnames lines[numpop+3]<-paste("Population\tIndividual",paste(loci,sep="",collapse="\t"),sep="\t") # enter population and sample names for(s in 1:numsam){ lines[numpop+3+s]<-paste(popinfo[samples[s]], samples[s], sep="\t") } # process alleles and write them to lines for(L in loci){ # replace missing data with zeros repmiss<-function(value){ if(value[1]==missing){ 0 } else {value} } convertedalleles<-mapply(repmiss, gendata[samples,L],SIMPLIFY=FALSE) # convert alleles to repeat number divallele<-function(value){floor(value/usatnts[L])} convertedalleles<-mapply(divallele,convertedalleles,SIMPLIFY=FALSE) # subtract if necessary to get them to the right number of digits suballele<-function(value){if(value[1]!=0){ value-(10^(digits-1))} else{ 0}} while(max(mapply(max,convertedalleles)) >= 10^digits){ convertedalleles<-mapply(suballele, convertedalleles, SIMPLIFY=FALSE) } # For each sample, concatenate into strings and add to lines for(s in 1:numsam){ # Convert alleles to character and set up string for concatenation charalleles<-as.character(convertedalleles[[s]]) allelestring<-"" # For each allele: for(ca in charalleles){ allele<-ca # Add zeros if necessary while(nchar(allele) < digits){ allele<-paste("0",allele,sep="") } # add this allele to the string allelestring<-paste(allelestring,allele,sep="") } # Add the allele string to the line for that sample. lines[numpop+3+s]<-paste(lines[numpop+3+s],allelestring,sep="\t") } } # write the file cat(lines, file=file, sep="\n") } write.Tetrasat<-function(gendata, commentline="insert data description here", samples=dimnames(gendata)[[1]], loci=dimnames(gendata)[[2]], popinfo=rep(1, length(samples)), usatnts=rep(2, length(loci)), file="", missing=-9){ # index popinfo and usatnts if not already done if(is.null(names(popinfo))){names(popinfo)<-samples} if(is.null(names(usatnts))){names(usatnts)<-loci} allpops<-unique(popinfo) # a vector of populations to cycle through # set up vector of lines to write to file lines<-c(commentline,loci) # where does population/genotype data begin? datastart<-length(lines) + 1 # make lines with "Pop" and sample names, and an index of where samples are sampleindex<-as.integer(c()) currentline<-datastart for(pop in allpops){ lines[currentline]<-"Pop" currentline<-currentline+1 # make a vector of all samples that belong to this population thesesamples<-samples[popinfo[samples]==pop] # make an index of the lines that they will go on indextoadd<-currentline:(currentline+length(thesesamples)-1) names(indextoadd)<-thesesamples # add this to the sample index sampleindex<-c(sampleindex,indextoadd) # write each sample name (plus spaces up to 20 characters) to the # appropriate line for(s in thesesamples){ samname<-s while(nchar(samname)<20){ samname<-paste(samname," ", sep="") } lines[currentline]<-samname currentline<-currentline+1 } } # now go through the loci, convert the genotypes, and fill them in for(L in loci){ # convert alleles to repeat number divallele<-function(value){ if(value[1]==missing){ value } else { floor(value/usatnts[L]) } } convertedalleles<-mapply(divallele, gendata[samples,L],SIMPLIFY=FALSE) # make sure alleles have two digits or fewer suballele<-function(value){ if(value[1]==missing){ value } else { value-10 } } while(max(mapply(max,convertedalleles)) >= 100){ convertedalleles<-mapply(suballele, convertedalleles, SIMPLIFY=FALSE) } # go through the genotypes by sample for(s in samples){ # convert missing data to blank spaces, convert to alleles to characters if(convertedalleles[[s]][1]==missing){ charalleles<-" " } else { charalleles<-as.character(convertedalleles[[s]]) # duplicate allele if fully homozygous if(length(charalleles)==1){ charalleles<-rep(charalleles,4) } # randomly remove alleles if there are more than 4 if(length(charalleles)>4){ charalleles<-sample(charalleles,4,replace=FALSE) cat(c("Alleles randomly removed:",s,L,"\n"),sep=" ") } } # concatenate all alleles into one string allelestring<-"" for(ca in charalleles){ allele<-ca # Add zeros if necessary while(nchar(allele) < 2){ allele<-paste("0",allele,sep="") } # add this allele to the string allelestring<-paste(allelestring,allele,sep="") } # add spaces up to nine characters while(nchar(allelestring) < 9){ allelestring<-paste(allelestring," ", sep="") } # add the concatenated genotype to the appropriate line lines[sampleindex[s]]<-paste(lines[sampleindex[s]],allelestring,sep="") } } cat(lines, sep="\n", file=file) } write.ATetra<-function(gendata, samples=dimnames(gendata)[[1]], loci=dimnames(gendata)[[2]], popinfo=rep(1,length(samples)), popnames="onebigpop", commentline="insert data info here", missing=-9, file=""){ # name popinfo if necessary if(is.null(names(popinfo))){names(popinfo)<-samples} # set up a character vector to hold the lines for the file lines<-paste("TIT",commentline, sep=",") currentline<-2 # a variable to say which line to write to next # make numbers to go with loci, pops, and samples locnums<-1:length(loci) names(locnums)<-loci samnums<-1:length(samples) names(samnums)<-samples popnums<-1:length(popnames) names(popnums)<-popnames if(length(popnames) != length(unique(popinfo))){ cat("Warning: number of populations in popnames and popinfo is different.\n") } # fill in data using a loop for(L in loci){ # write a line describing the locus lines[currentline]<-paste("LOC",locnums[L],L,sep=",") currentline<-currentline+1 for(p in popnames){ # write a line describing the population lines[currentline]<-paste("POP",locnums[L],popnums[p],p, sep=",") currentline<-currentline+1 # get a vector of individuals in this population thesesamples<-names(popinfo)[popinfo==popnums[p]] thesesamples<-thesesamples[thesesamples %in% samples] # write lines for individuals for(s in thesesamples){ # first put sample info into the line lines[currentline]<-paste("IND",locnums[L],popnums[p],samnums[s],s,sep=",") # get the alleles and print a warning if there is missing data thesealleles<-gendata[[s,L]] if(thesealleles[1]==missing){ thesealleles<-"" cat("Missing data:",s,L,"\n",sep=" ") } # take a random sample if there are more than 4 if(length(thesealleles)>4){ thesealleles<-sample(thesealleles,4,replace=FALSE) cat("More than 4 alleles:",s,L,"\n",sep=" ") } # add alleles to the line for(a in 1:4){ if(length(thesealleles)>=a ){ lines[currentline]<-paste(lines[currentline],thesealleles[a],sep=",") } else { lines[currentline]<-paste(lines[currentline],"",sep=",") } } currentline<-currentline+1 } } } # write the "END-record" lines[currentline]<-"END" # write the file cat(lines, sep="\n", file=file) } read.Structure<-function(infile,missingin=-9,missingout=-9,sep="\t",markernames=TRUE, labels=TRUE, extrarows=1, extracols=0, ploidy=4, getexcols=FALSE ){ # read the file rawdata<-read.table(infile,header=markernames,sep=sep) # get an index of samples and a column labeling the samples if(labels){ # if row labels are used, get those as the sample names samples<-unique(rawdata[[1]]) samples<-samples[!is.na(samples)] samples<-as.character(samples) samples<-samples[samples != ""] names(rawdata)[1]<-"Samples" } else { # make an integer vector to represent samples samples<-1:((dim(rawdata)[1]-extrarows)/ploidy) samindex<-c(rep(0,times=extrarows),rep(samples, each=ploidy)) rawdata[length(rawdata)+1]<-samindex names(rawdata)[length(rawdata)]<-"Samples" } # get an index of loci (will be V2 etc. if loci not named) loci<-names(rawdata) loci<-loci[loci != "Samples"] loci<-loci[(extracols+1):length(loci)] # set up the list to store the genotypes gendata<-array(list(missingout),dim=c(length(samples),length(loci)), dimnames=list(samples,loci)) # fill the list for(L in loci){ for(s in samples){ rawalleles<-rawdata[rawdata$Samples==s,L] # process missing data and get unique alleles thesealleles<-unique(rawalleles) if(length(thesealleles)==1 && thesealleles[1]==missingin){ thesealleles<-missingout } else { thesealleles<-thesealleles[thesealleles != missingin] } gendata[[s,L]]<-thesealleles } } # extract the extra columns, if needed if(getexcols){ Extracol<-data.frame(row.names=samples) if(labels){ colsbeforex<-1 } else { colsbeforex<-0 } for(x in extracols){ # look up values by sample in this column # add the column to the data frame Extracol[[x]]<-rawdata[[x+colsbeforex]][seq(1+extrarows, length(rawdata[[1]])+1-ploidy, by=ploidy)] } } # return extra columns and genotypes if(getexcols){ return(list(ExtraCol=Extracol, Genotypes=gendata)) } else { return(gendata) } } write.GeneMapper<-function(gendata,file="",samples=dimnames(gendata)[[1]], loci=dimnames(gendata)[[2]]){ # figure out how many allele columns are needed numallelecol<-max(estimate.ploidy(gendata[samples,loci])[,"max.alleles"]) # figure out how many rows are needed numrows<-length(samples)*length(loci) # set up data frame gentable<-data.frame(Sample.Name=rep(samples, times=length(loci)), Marker=rep(loci, each=length(samples))) # put empty allele columns into data frame alcollabels<-paste("Allele.",1:numallelecol,sep="") for(ac in 1:numallelecol){ gentable[[ac+2]]<-rep("",times=numrows) names(gentable)[ac+2]<-alcollabels[ac] } # put alleles into their respective cells currentrow<-1 for(L in loci){ for(s in samples){ thesealleles<-as.character(gendata[[s,L]]) gentable[currentrow,3:(length(thesealleles)+2)]<-thesealleles currentrow<-currentrow+1 } } # write the table to file write.table(gentable, file=file, sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) } Lynch.distance<-function(genotype1,genotype2,missing=-9){ if(genotype1[1]==missing || genotype2[1]==missing){ # return NA if there is any missing data distance<-NA } else { # get the average number of bands for the two genotypes meanbands<-(length(genotype1)+length(genotype2))/2 # find how many bands the genotypes have in common commonbands<-length(genotype1[genotype1 %in% genotype2]) # calculate the distance distance<- 1-(commonbands/meanbands) } # return distance return(distance) } meandist.from.array<-function(distarray, samples=dimnames(distarray)[[2]], loci=dimnames(distarray)[[1]]){ # get the array to be averaged subarray<-distarray[loci,samples,samples] # make a matrix to put the means into mean.matrix<-matrix(nrow=length(samples),ncol=length(samples), dimnames=list(samples,samples)) for(j in samples){ for(k in samples){ mean.matrix[j,k]<-mean(subarray[,j,k][!is.na(subarray[,j,k])]) } } return(mean.matrix) } find.na.dist<-function(distarray, samples=dimnames(distarray)[[2]], loci=dimnames(distarray)[[1]]){ # set up vectors for data frame to contain info on where missing data is Locus<-"" Sample1<-"" Sample2<-"" # current row in the data frame currrow<-1 # go through the array, find NA, and put the index into the data frame for(L in loci){ for(s1 in samples){ for(s2 in samples){ if(is.na(distarray[L,s1,s2])){ Locus[currrow]<-L Sample1[currrow]<-s1 Sample2[currrow]<-s2 currrow<-currrow+1 } } } } #return data frame return(data.frame(Locus=Locus, Sample1=Sample1, Sample2=Sample2, stringsAsFactors=FALSE)) } find.missing.gen<-function(gendata, missing=-9, samples=dimnames(gendata)[[1]], loci=dimnames(gendata)[[2]]){ # set up vectors to contain the indices to put into the data frame Locus <- c("") Sample <- c("") # current row in the data frame currrow<-1 # find which data are missing for(L in loci){ for(s in samples){ if(gendata[[s,L]][1]==missing){ Locus[currrow]<-L Sample[currrow]<-s currrow<-currrow+1 } } } # return the data frame return(data.frame(Locus=Locus,Sample=Sample, stringsAsFactors=FALSE)) } # find NA distances that aren't the result of missing data find.na.dist.not.missing<-function(gendata, distarray, missing=-9, samples=dimnames(distarray)[[2]], loci=dimnames(distarray)[[1]]){ # get the data frames of NA distances and missing genotypes na.dist<-find.na.dist(distarray,samples=samples,loci=loci) missing.gen<-find.missing.gen(gendata,missing=missing,samples=samples,loci=loci) # set up vectors for data frame to contain results Locus<-"" Sample1<-"" Sample2<-"" currrow<-1 # for each row of na.dist, look for that locus and samples in missing.gen for(i in 1:length(na.dist$Locus)){ L<-na.dist$Locus[i] s1<-na.dist$Sample1[i] s2<-na.dist$Sample2[i] missthislocus<-missing.gen[missing.gen$Locus==L,] if(identical(c(s1,s2) %in% missthislocus$Sample, c(FALSE,FALSE))){ Locus[currrow]<-L Sample1[currrow]<-s1 Sample2[currrow]<-s2 currrow<-currrow+1 } } # return data frame return(data.frame(Locus=Locus, Sample1=Sample1, Sample2=Sample2, stringsAsFactors=FALSE)) } estimate.freq<-function(gendata, missing=-9, samples=dimnames(gendata)[[1]], loci=dimnames(gendata)[[2]], popinfo=rep(1,length(samples)), indploidies=rep(4,length(samples))){ # name popinfo and indploidies if necessary if(is.null(names(popinfo))){ names(popinfo)<-samples } if(is.null(names(indploidies))){ names(indploidies)<-samples } # get presence/absence data for alleles, and the table of which allele and locus # is in which column gentable<-codominant.to.dominant(gendata[samples,loci], makecolinfo=TRUE, missingin=missing) colinfo<-gentable[[2]] gentable<-gentable[[1]] # get a list of populations pops<-sort(unique(popinfo[samples])) # get total number of genomes per population totgenomes<-rep(0,length(pops)) names(totgenomes)<-pops for(p in pops){ totgenomes[p]<-sum(indploidies[samples[popinfo[samples]==p]]) } # set up data frame to contain allele frequencies freqtable<-data.frame(Genomes=totgenomes,row.names=pops) # loop to get frequency data for(L in loci){ # get all samples without missing data at this locus xsamples<-samples[gentable[,match(L,colinfo[[1]])]!=-9] # get the total number of genomes per population totgenomes<-rep(0,length(pops)) names(totgenomes)<-pops for(p in pops){ totgenomes[p]<-sum(indploidies[xsamples[popinfo[xsamples]==p]]) } # make a conversion factor to weight allele presence based on ploidy # of each individual and number of alleles at this locus numalleles<-sapply(gendata[xsamples,L],length) names(numalleles)<-xsamples convf<-indploidies[xsamples]/numalleles # convert alleles in the table to estimated copy number loctable<-gentable[xsamples,colinfo[[1]]==L]*convf # loop through alleles at this locus for(al in colinfo[[2]][colinfo[[1]]==L]){ theseallelefreqs<-rep(0,length(pops)) names(theseallelefreqs)<-pops # loop through populations for(p in pops){ theseallelefreqs[p]<-sum(loctable[xsamples[popinfo[xsamples]==p], paste(L,al,sep=".")])/totgenomes[p] } freqtable<-cbind(freqtable,theseallelefreqs) } } # return data frame names(freqtable)<-c("Genomes",dimnames(gentable)[[2]]) return(freqtable) } calcFst<-function(freqs, pops=row.names(freqs), loci=unique(as.matrix(as.data.frame(strsplit(names(freqs), split=".", fixed=TRUE), stringsAsFactors=FALSE))[1,])){ # Clean up loci loci<-loci[loci!="Genomes"] # Set up matrix for Fst values fsts<-matrix(0,nrow=length(pops),ncol=length(pops),dimnames=list(pops,pops)) # Get genome number from the table genomes<-freqs$Genomes names(genomes)<-pops for(m in 1:length(pops)){ for(n in m:length(pops)){ # set up array for HT and HS values hets<-array(0,dim=c(length(loci),2),dimnames=list(loci,c("HT","HS"))) for(L in loci){ # get just the frequencies for these pops and this locus thesefreqs<-freqs[c(pops[m],pops[n]),grep(L,names(freqs),fixed=TRUE)] # get average allele frequencies weighted by genomes/pop avgfreq<-(thesefreqs[1,]*genomes[pops[m]] + thesefreqs[2,]*genomes[pops[n]])/ (genomes[pops[m]] + genomes[pops[n]]) # estimate H by 1 - sum of squared allele frequencies # put the heterozygositites in the array hets[L,"HT"]<-1-sum(avgfreq^2) hets[L,"HS"]<-((1-sum(thesefreqs[1,]^2))*genomes[pops[m]] + (1-sum(thesefreqs[2,]^2))*genomes[pops[n]])/ (genomes[pops[m]] + genomes[pops[n]]) } HT<-mean(hets[,"HT"]) HS<-mean(hets[,"HS"]) fsts[m,n]<-(HT-HS)/HT fsts[n,m]<-(HT-HS)/HT } } # return matrix of Fst values return(fsts) } read.SPAGeDi<-function(infile, allelesep="/", returncatspatcoord=FALSE, returnploidies=FALSE, missing=-9){ # get all lines from the file, and eliminate comment lines Lines<-readLines(infile) Lines<-Lines[Lines != ""] first2<-sapply(Lines, substring, first=1, last=2) Lines<-Lines[first2 != "//"] # get data from the first line fileinfo<-as.integer(strsplit(Lines[1], "\t")[[1]]) numind<-fileinfo[1] # number of samples numcat<-fileinfo[2] # number of categories numsc<-fileinfo[3] # number of spatial coordinates numloc<-fileinfo[4] # number of loci digits<-fileinfo[5] # number of digits to represent alleles # Is there a column of categories? if(numcat==0){ catpres<-0 } else { catpres<-1 } # Is latitude and longitude used instead of Cartesian coordinates? if(numsc==-2){ numsc<-2 } # read the rest of the file as a table cat(Lines[3:(3+numind)], sep="\n", file="SpagTemp.txt") gentable <- read.table("SpagTemp.txt", sep="\t", header=TRUE, row.names=1, colClasses=c("character",rep(NA,catpres+numsc), rep("character",numloc))) # get sample and locus names samples<-row.names(gentable) loci<-names(gentable)[(length(gentable)-numloc+1):length(gentable)] # set up list to contain genotypes gendata <- array(list(missing), dim=c(length(samples), length(loci)), dimnames=list(samples,loci)) # set up vector to contain ploidies indploidies <- rep(4, length(samples)) names(indploidies) <- samples # If there is no separation of alleles, count digits off with substring if(allelesep==""){ for(s in samples){ #set up a list to contain genotypes thesegenotypes<-list(0) length(thesegenotypes)<-length(loci) names(thesegenotypes)<-loci for(L in loci){ # add leading zeros if necessary while(nchar(gentable[s,L])%%digits !=0){ gentable[s,L]<-paste("0",gentable[s,L],sep="") } # split into alleles and convert to integers thesegenotypes[[L]]<-as.integer(substring(gentable[s,L], first=seq(1, nchar(gentable[s,L])-digits+1, by=digits), last=seq(digits, nchar(gentable[s,L]),by=digits))) # if genotype only has zeros, write missing data symbol if(length(unique(thesegenotypes[[L]]))==1 && thesegenotypes[[L]][1]==0){ thesegenotypes[[L]]<-missing } # otherwise remove zeros on left while(thesegenotypes[[L]][1]==0){ thesegenotypes[[L]]<-thesegenotypes[[L]][-1] } } # get ploidy of sample indploidies[s] <- max(sapply(thesegenotypes,length)) for(L in loci){ # remove zeros on the right thesegenotypes[[L]]<-thesegenotypes[[L]][thesegenotypes[[L]] != 0] # get unique alleles thesegenotypes[[L]]<-unique(thesegenotypes[[L]]) } # add genotypes to list gendata[s,]<-thesegenotypes } # get alleles by strsplit } else { for(s in samples){ # get alleles by splitting the strings thesegenotypes<-sapply(gentable[s,loci],strsplit, split=allelesep,fixed=TRUE) names(thesegenotypes) <- loci for(L in loci){ # convert to integer thesegenotypes[[L]]<-as.integer(thesegenotypes[[L]]) # if genotype only has zeros, write missing data symbol if(length(unique(thesegenotypes[[L]]))==1 && thesegenotypes[[L]][1]==0){ thesegenotypes[[L]]<-missing } # otherwise remove zeros on left while(thesegenotypes[[L]][1]==0){ thesegenotypes[[L]]<-thesegenotypes[[L]][-1] } } # get ploidy of sample indploidies[s] <- max(sapply(thesegenotypes,length)) for(L in loci){ # remove zeros on the right thesegenotypes[[L]]<-thesegenotypes[[L]][thesegenotypes[[L]] != 0] # get unique alleles thesegenotypes[[L]]<-unique(thesegenotypes[[L]]) } # add genotypes to list gendata[s,]<-thesegenotypes } } # return the genotypes, and other data as applicable if(identical(c(returncatspatcoord,returnploidies), c(FALSE,FALSE))){ return(gendata) } else { return(list(CatSpatCoord=gentable[,!names(gentable) %in% loci], Indploidies=indploidies, Genotypes=gendata)[c(returncatspatcoord, returnploidies,TRUE)]) } } write.SPAGeDi<-function(gendata,samples=dimnames(gendata)[[1]], loci=dimnames(gendata)[[2]], indploidies=rep(4,length(samples)), popinfo=rep(1,length(samples)), allelesep="/", digits=2, file="", spatcoord=data.frame(X=rep(1,length(samples)), Y=rep(1,length(samples)), row.names=samples), usatnts=rep(2, length(loci)), missing=-9){ # name indploidies, popinfo, usatnts, and if not already done if(is.null(names(indploidies))){names(indploidies)<-samples} if(is.null(names(popinfo))){names(popinfo)<-samples} if(is.null(names(usatnts))){names(usatnts)<-loci} if(identical(row.names(spatcoord), as.character(1:dim(spatcoord)[1]))){ row.names(spatcoord)<-samples} # set up data frame to contain genotypes gentable<-data.frame(Ind=samples, Cat=popinfo[samples], spatcoord[samples,]) # find missing data misstable<-find.missing.gen(gendata[samples,loci], missing=missing) # get genotype data by locus (column) for(L in loci){ genotypesL<-gendata[samples,L] missingsamples<-misstable$Sample[misstable$Locus==L] # replace missing data with zeros genotypesL[missingsamples]<-0 # divide and subtract to convert to repeats divallele<-function(value){floor(value/usatnts[L])} genotypesL<-mapply(divallele,genotypesL,SIMPLIFY=FALSE) suballele<-function(value){if(value[1]!=0){ value-(10^(digits-1))} else{ 0}} while(max(mapply(max,genotypesL)) >= 10^digits){ genotypesL<-mapply(suballele, genotypesL, SIMPLIFY=FALSE) } # add zeros up to correct ploidy, delete alleles if necessary zerostoadd<-indploidies[samples] - mapply(length,genotypesL) names(zerostoadd)<-samples for(s in samples){ if(length(genotypesL[[s]])==1){ # replicate allele if totally homozygous genotypesL[[s]]<-rep(genotypesL[[s]],indploidies[s]) } else { if(zerostoadd[s] < 0){ # randomly remove alleles if there are too many genotypesL[[s]]<-sample(genotypesL[[s]],indploidies[s], replace=FALSE) cat("Alleles randomly removed to get to ploidy:",L,s,"\n") } else { # add zeros for partial heterozygotes genotypesL[[s]]<-c(genotypesL[[s]],rep(0,zerostoadd[s])) } } # also make each allele the right number of digits if necessary if(allelesep==""){ genotypesL[[s]]<-as.character(genotypesL[[s]]) for(a in 1:length(genotypesL[[s]])){ while(nchar(genotypesL[[s]][a]) < digits){ genotypesL[[s]][a]<-paste(0,genotypesL[[s]][a],sep="") } } } } # concatenate into strings genvect<-mapply(paste,genotypesL,collapse=allelesep) # add the vector to the data frame gentable<-data.frame(gentable,genvect) names(gentable)[dim(gentable)[2]]<-L } # write file write.table(gentable,file="SpagTemp.txt",sep="\t",row.names=FALSE, col.names=TRUE,quote=FALSE) cat(paste(length(samples),length(unique(popinfo[samples])),dim(spatcoord)[2], length(loci),digits,max(indploidies[samples]),sep="\t"),"0", readLines("SpagTemp.txt"),"END",sep="\n",file=file) }