#Script to perform interval matching of short read sequence intervals and predicted gene models (Glyma1.01 genome assembly). clustercount<-function(j,filename){ #sapply allows for intervals to be used in the function sapply(j,function(i){ temp1<-solexa[,4:5] tempsoymap<-soymap[i,4:5] temp3<-temp1[which((temp1[,1]<=tempsoymap[[1]] & temp1[,2]>=tempsoymap[[2]]) | (temp1[,1]<=tempsoymap[[2]] & temp1[,2]>=tempsoymap[[2]]) | (temp1[,1]>=tempsoymap[[1]] & temp1[,1]<=tempsoymap[[2]]) | (temp1[,2]>=tempsoymap[[1]] & temp1[,2]<=tempsoymap[[2]])),] write.table(cbind(soymap[i,],dim(temp3)[1]), file=filename, append=T,quote=F,col.names = F) }) } library(intervals) #read in data soyin<-read.table('~/solexa/glymacoords/Glyma1.col5') solexain<-read.table('./solexa.col5') names<-as.matrix(read.table('./names.txt')) #set counters countNomap=0 #For each chromosome/scaffold set up each soy and solexa variable #and then proceed processing to the same file each time #for (n in 21:21){ for (n in 1:length(names)){ print(n) print(names[n]) #take only those sequences and gene models that correspond with the chromosome/scaffold solexa<-solexain[which(solexain[,3]==names[n]),] soy<-soyin[which(soyin[,3]==names[n]),] #check to see if there are any solexa and soy sequences for this scaffold #this will not be an issue if you create the names file from the solexa data each time #however for testing purposes on small sets I put it here soyinlength=length(which(soyin[,3]==names[n])) solexainlength=length(which(solexain[,3]==names[n])) if (solexainlength==0 & soyinlength==0){ next } #check to see of there are any solexa sequences for this chromosome/scaffold #if there aren't write out the gene models with 0 counts to the outfile and go to #the next chromosome/scaffold if (solexainlength==0){ soy2<-as.data.frame(soy) soy2$V6=0 #map count -- how many solexa reads map to this model soy2$V7=soy$V4 soy2$V8=soy$V5 write.table(soy2, file="GlymaIDCounts.txt", append=T,quote=F,col.names = F) next } #check to see if there are solexa sequences and see if there are #gene models for this chromosome/scaffold. If no genes then write out the #solexa sequences to a no map to genemodel output file if (soyinlength==0 & solexainlength!=0){ print(paste("no gene models for",names[n])) countNomap=countNomap+dim(solexa)[1] write.table(solexa, file="GSNAPIDnoMap.txt", append=T,quote=F,col.names = F) next } #Now that we know we have both solexa reads and gene models for this chromosome/scaffold #convert the data to intervals for comparison soyI<-Intervals(soy[,4:5]) solexaI<-Intervals(solexa[,4:5]) #find those in the solexa data that don't map to gene models and write them out to a separte file d2nsolexa<-distance_to_nearest(solexaI,soyI) if (length(which(d2nsolexa>0))>0){ solexa[which(d2nsolexa>0),]->solexanomap countNomap=countNomap+dim(solexanomap)[1] write.table(solexanomap, file="GSNAPIDnoMap.txt", append=T,quote=F,col.names = F) } #find those in the gene models that are mapped to the solexa data d2nsoy<-distance_to_nearest(soyI,solexaI) soy[which(d2nsoy>0),]->soynomap #those gene models that do not map soy[which(d2nsoy==0),]->soymap #those gene models that do map to the solexa data #count how many of the solexa reads map to the gene model #if there is > 0 that map then run clustercount defined above that accounts #for sequences that overlap partially with the gene model by expanding the gene model soyany<-length(which(d2nsoy==0)) #how many map to the solexa data if (soyany != 0){ clustercount(1:dim(soymap)[1],"GlymaIDCounts.txt") } #output the nomap glymaID and Glymacoords in the same format with a 0 for the map count if(length(which(d2nsoy>0))!=0){ soynomap2<-as.data.frame(soynomap) soynomap2$V6=0 #map count -- how many solexa reads map to this model soynomap2$V7=soynomap$V4 soynomap2$V8=soynomap$V5 write.table(soynomap2, file="GlymaIDCounts.txt", append=T,quote=F,col.names = F) } print(paste("countNomap",countNomap)) }