###### load parallel library
library("parallel")
mc.cores.user<-60

###### load BSgenome.Hsapiens package
library("BSgenome.Hsapiens.UCSC.hg19")

###### define the ligation function
ligation.function<-function(chr.1,chr.2,frag.1,frag.2,ligation.type,seqlength){
        if (ligation.type!="none" & diff(frag.1)>=(seqlength-1) & diff(frag.2)>=(seqlength-1)){
                a<-toString(reverseComplement(subseq(Hsapiens[[chr.1]],frag.1[1],frag.1[1]+seqlength-1)))
                b<-toString(subseq(Hsapiens[[chr.1]],frag.1[2]-seqlength+1,frag.1[2]))
                c<-toString(reverseComplement(subseq(Hsapiens[[chr.2]],frag.2[1],frag.2[1]+seqlength-1)))
                d<-toString(subseq(Hsapiens[[chr.2]],frag.2[2]-seqlength+1,frag.2[2]))

                temp<-strsplit(ligation.type,split="")[[1]]

                return(c(get(temp[1]),get(temp[2])))
        }
        else{
                return(NA)
        }
}

##### set number of samples and other settings
mean.frag.length<-250
n.cells<-1E5
lp<-0.8
seqlength<-50

ligation<-c("ab","cd","ac","bd","ad","bc","none")
p.ligation<-c(rep((1-(1-lp)^2)/6,6),(1-lp)^2)

###### get chromosome lengths
chr.length<-sapply(1:23,function(i){length(Hsapiens[[i]])})

###### get TFBS information
TFBS<-read.csv("~/chiapet_simulation/chipBase_Human_TFBSs_ERa_MCF7.csv",stringsAsFactors=FALSE)
TFBS<-TFBS[TFBS$chrom!="chrY",]
TFBS$midpoint<-(TFBS$chromStart+TFBS$chromEnd)/2

###### get TSS information
# read in annotation, remove isoforms with multiple gene symbols or no gene symbols
TSS<-read.delim("~/chiapet_simulation/GRCh37_refgene_20140512.txt",comment.char="#",header=FALSE,stringsAsFactors=FALSE)
temp<-strsplit(TSS[,6],",")
TSS<-TSS[(sapply(temp,length)==1) & (sapply(temp,"[",1)!="n/a"),]
TSS[,6]<-gsub(",","",TSS[,6])

# remove "complicated" genes
exclude.gene.1<-unique(TSS[grepl("_|M|Un|Y",TSS[,2]),6])
temp<-by(TSS[,3],TSS[,6],function(x){length(unique(x))})
exclude.gene.2<-names(temp)[temp>1]

TSS<-TSS[!(TSS[,6] %in% union(exclude.gene.1,exclude.gene.2)),]

# get TSS information
TSS[,7]<-ifelse(TSS[,3]=="+",TSS[,4]+1,TSS[,5])
TSS<-unique(TSS[,c(2,6,7)])
colnames(TSS)<-c("chr","gene","TSS")

###### select interested interaction sites
set.seed(2008)

TFBS.selected<-mat.or.vec(23,2); TFBS.selected.id<-mat.or.vec(23,2)
TSS.selected<-mat.or.vec(23,6); TSS.selected.id<-mat.or.vec(23,6)

for (i in 1:23){
	chr.idx<-ifelse(i==23,"X",i)

	temp<-sample(which(TFBS$chrom==paste("chr",chr.idx,sep="")),2)
	TFBS.selected[i,]<-TFBS$midpoint[temp]
	TFBS.selected.id[i,]<-TFBS$name[temp]

	temp<-sample(which(TSS$chr==paste("chr",chr.idx,sep="")),2)
	TSS.selected[i,1:2]<-TSS$TSS[temp]
	TSS.selected.id[i,1:2]<-TSS$gene[temp]

	TSS.selected[i,3:6]<-sample.int(chr.length[i],4)
	TSS.selected.id[i,3:6]<-rep("No_gene",4)
}

######## make pairings
temp.1<-expand.grid(1:23,1:23,1:2,1:2)
temp.2<-expand.grid(1:23,1:23,1:2,3:6)

pairing<-rbind(temp.1,temp.2)
pairing[,5]<-sapply(1:nrow(pairing),function(i){return(TFBS.selected[pairing[i,1],pairing[i,3]])})
pairing[,6]<-sapply(1:nrow(pairing),function(i){return(TSS.selected[pairing[i,2],pairing[i,4]])})
pairing.gene<-sapply(1:nrow(pairing),function(i){return(TSS.selected.id[pairing[i,2],pairing[i,4]])})

PT<-sapply(1:nrow(temp.1),function(i){ifelse(temp.1[i,3]==temp.1[i,4],"TH","TL")})
PT<-c(PT,rep("F",23^2*8))

####### set sampling probabilities so that TH:TL:F=6:3:1
temp<-1/(6*23^2*2+3*23^2*2+23^2*8)
sampling.prob<-ifelse(PT=="TH",6*temp,ifelse(PT=="TL",3*temp,temp))

###### sample the pairs
sample.index<-sample(1:6348,n.cells,sampling.prob,replace=T)
sample.ligation<-ligation[sample(1:7,n.cells,p.ligation,replace=T)]

###### perform sonications to get the PET reads
sonication.result<-mclapply(1:n.cells,function(idx.cell){
						i<-sample.index[idx.cell]

						chr.1<-pairing[i,1]; pt.1<-pairing[i,5]
						chr.2<-pairing[i,2]; pt.2<-pairing[i,6]
								
						num.frag.1<-rpois(1,chr.length[chr.1]/mean.frag.length)
                	        		sonicate.pt.1<-c(0,sample.int(chr.length[chr.1],num.frag.1),chr.length[chr.1])
		                	        frag.1<-c(max(sonicate.pt.1[sonicate.pt.1<pt.1])+1,min(sonicate.pt.1[sonicate.pt.1>pt.1]))
						
						if (pairing[i,1]==pairing[i,2]){
							frag.2<-c(max(sonicate.pt.1[sonicate.pt.1<pt.2])+1,min(sonicate.pt.1[sonicate.pt.1>pt.2]))
						}
						else{
							num.frag.2<-rpois(1,chr.length[chr.2]/mean.frag.length)
							sonicate.pt.2<-c(0,sample.int(chr.length[chr.2],num.frag.2),chr.length[chr.2])
							frag.2<-c(max(sonicate.pt.2[sonicate.pt.2<pt.2])+1,min(sonicate.pt.2[sonicate.pt.2>pt.2]))
						}
			
						return(ligation.function(chr.1,chr.2,frag.1,frag.2,sample.ligation[idx.cell],seqlength))
					},mc.cores=mc.cores.user)				

###### remove redundant sonication results
sonication.result<-sonication.result[!sapply(sonication.result,function(i){any(is.na(i))})]

PET.seq1<-sapply(sonication.result,"[",1)
PET.seq2<-sapply(sonication.result,"[",2)

idx<-!(grepl("#",PET.seq1) | grepl("#",PET.seq2))
PET.seq1<-PET.seq1[idx]
PET.seq2<-PET.seq2[idx]

###### make two DNAStringSet objects to contain the PET's
PET.seq1<-DNAStringSet(PET.seq1)
PET.seq2<-DNAStringSet(PET.seq2)

names(PET.seq1)<-1:length(PET.seq1)
names(PET.seq2)<-1:length(PET.seq2)

###### map the PET's to genome
seqnames<-seqnames(Hsapiens)
new.matchpdict<-function(dict0,strand,mc.cores.user){
	if (strand == "-"){
               dict0<-reverseComplement(dict0)
      }
	dict<-PDict(dict0)

	list.table<-mclapply(seqnames,function(seqname){
				subject<-Hsapiens[[seqname]]
				m<-extractAllMatches(subject, matchPDict(dict, subject))
				return(data.frame(as.integer(names(m)),rep(seqname,length(m)),start(m),end(m),stringsAsFactors=FALSE))	
			},mc.cores=mc.cores.user)
	table<-data.frame(do.call(rbind,list.table),strand,stringsAsFactors=FALSE)
	colnames(table)<-c("PatternID","chromosome","start","end","strand")
	return(table)
}

match.seq1.plus<-new.matchpdict(PET.seq1,strand="+",mc.cores.user=mc.cores.user)
match.seq1.minus<-new.matchpdict(PET.seq1,strand="-",mc.cores.user=mc.cores.user)
match.seq1<-rbind(match.seq1.plus,match.seq1.minus)

match.seq2.plus<-new.matchpdict(PET.seq2,strand="+",mc.cores.user=mc.cores.user)
match.seq2.minus<-new.matchpdict(PET.seq2,strand="-",mc.cores.user=mc.cores.user)
match.seq2<-rbind(match.seq2.plus,match.seq2.minus)

###### remove non-unique alignments
temp.1<-table(match.seq1$PatternID)
temp.2<-table(match.seq2$PatternID)
include.pattern<-as.integer(intersect(names(temp.1)[temp.1==1],names(temp.2)[temp.2==1])) 

match.seq1<-match.seq1[match.seq1$PatternID %in% include.pattern,]
match.seq1<-match.seq1[order(match.seq1$PatternID),]

match.seq2<-match.seq2[match.seq2$PatternID %in% include.pattern,]
match.seq2<-match.seq2[order(match.seq2$PatternID),]

##### distinguish self-loops from inter-loops
dist<-unlist(mclapply(1:nrow(match.seq1),function(i){
						if (match.seq1$chromosome[i]==match.seq2$chromosome[i]){
							return(abs(match.seq1$start[i]-match.seq2$start[i])+seqlength)
						}
						else {
							return(0)
						}
					},mc.cores=mc.cores.user))
temp<-kmeans(log(dist[dist>0]),2)
self.idx<-ifelse(temp$centers[1]>temp$centers[2],as.integer(2),as.integer(1))

group<-ifelse(dist==0,"inter.diff","same")
group[group=="same"]<-ifelse(temp$cluster==self.idx,"self","inter.same")

###### create GRanges
match.seq1<-match.seq1[group!="self",]
match.seq2<-match.seq2[group!="self",]

names(chr.length)<-paste("chr",c(1:22,"X"),sep="")

gr.seq1<-GRanges(seqnames=match.seq1$chromosome,ranges=IRanges(start=match.seq1$start,end=match.seq1$end),strand=match.seq1$strand,seqlengths=chr.length)
gr.seq2<-GRanges(seqnames=match.seq2$chromosome,ranges=IRanges(start=match.seq2$start,end=match.seq2$end),strand=match.seq2$strand,seqlengths=chr.length)

##### extend to mean.frag.length
gr.seq1<-resize(gr.seq1,mean.frag.length,fix="end")
gr.seq2<-resize(gr.seq2,mean.frag.length,fix="end")

##### find interaction anchors, counts and marginal counts (mc)
all.gr<-reduce(c(gr.seq1,gr.seq2),ignore.strand=TRUE)

hit.seq1<-findOverlaps(gr.seq1,all.gr)
hit.seq2<-findOverlaps(gr.seq2,all.gr)

temp<-cbind(subjectHits(hit.seq1),subjectHits(hit.seq2))
temp.1<-temp[subjectHits(hit.seq2)>subjectHits(hit.seq1),]
temp.2<-temp[subjectHits(hit.seq2)<subjectHits(hit.seq1),2:1]

raw.table<-rbind(temp.1,temp.2)
ones<-rep(1,nrow(raw.table))
count.table<-aggregate(ones,by=list(raw.table[,1],raw.table[,2]),sum)

temp<-paste(seqnames(all.gr),":",start(all.gr),"-",end(all.gr),sep="")
count.table[,4]<-temp[count.table[,1]]
count.table[,5]<-temp[count.table[,2]]

temp.reference<-table(as.vector(raw.table))
count.table[,6]<-unlist(mclapply(1:nrow(count.table),function(i){
					return(temp.reference[as.character(count.table[i,1])]+temp.reference[as.character(count.table[i,2])])
				},mc.cores=mc.cores.user))
colnames(count.table)<-c("frag.1.id","frag.2.id","count","frag.1","frag.2","mc")

##### get distances
#all.gr, TFBS, TSS
all.gr.mid<-(start(all.gr)+end(all.gr))/2
all.gr.chr<-as.character(seqnames(all.gr))
all.gr.start<-as.numeric(start(all.gr)); all.gr.end<-as.numeric(end(all.gr))

temp.TFBS<-mclapply(1:length(all.gr),function(i){
					temp<-TFBS[TFBS$chrom==all.gr.chr[i],]
					dist<-abs(temp$midpoint-all.gr.mid[i])

					dist.i<-min(dist)
					idx.i<-which.min(dist)
	
					return(c(dist.i,temp$midpoint[idx.i]))
				},mc.cores=mc.cores.user)

temp.TSS<-mclapply(1:length(all.gr),function(i){
                                        temp<-TSS[TSS$chr==all.gr.chr[i],]
                                        dist<-abs(temp$TSS-all.gr.mid[i])
                                        
                                        dist.i<-min(dist)
                                        idx.i<-which.min(dist)
                                
                                        return(list(dist.i,temp$gene[idx.i]))
                                },mc.cores=mc.cores.user)

pairing<-data.frame(pairing)
pairing[,1]<-paste("chr",ifelse(pairing[,1]==23,"X",pairing[,1]),sep="")
pairing[,2]<-paste("chr",ifelse(pairing[,2]==23,"X",pairing[,2]),sep="")

mini.distance<-mclapply(1:nrow(count.table),function(i){
						index.gr.1<-count.table$frag.1.id[i]; index.gr.2<-count.table$frag.2.id[i];

						chr.index.1<-match(all.gr.chr[index.gr.1],paste("chr",c(1:22,"X"),sep=""))
                                                start.1<-all.gr.start[index.gr.1]; end.1<-all.gr.end[index.gr.1];
                                                TFBS.yn.11<-(start.1<=TFBS.selected[chr.index.1,1] & TFBS.selected[chr.index.1,1]<=end.1)
                                                TFBS.yn.12<-(start.1<=TFBS.selected[chr.index.1,2] & TFBS.selected[chr.index.1,2]<=end.1)
                                                TSS.yn.11<-(start.1<=TSS.selected[chr.index.1,1] & TSS.selected[chr.index.1,1]<=end.1)
                                                TSS.yn.12<-(start.1<=TSS.selected[chr.index.1,2] & TSS.selected[chr.index.1,2]<=end.1)

                                                chr.index.2<-match(all.gr.chr[index.gr.2],paste("chr",c(1:22,"X"),sep=""))
                                                start.2<-all.gr.start[index.gr.2]; end.2<-all.gr.end[index.gr.2];
                                                TFBS.yn.21<-(start.2<=TFBS.selected[chr.index.2,1] & TFBS.selected[chr.index.2,1]<=end.2)
                                                TFBS.yn.22<-(start.2<=TFBS.selected[chr.index.2,2] & TFBS.selected[chr.index.2,2]<=end.2)
                                                TSS.yn.21<-(start.2<=TSS.selected[chr.index.2,1] & TSS.selected[chr.index.2,1]<=end.2)
                                                TSS.yn.22<-(start.2<=TSS.selected[chr.index.2,2] & TSS.selected[chr.index.2,2]<=end.2)

						a<-(temp.TFBS[[index.gr.1]][1]+temp.TSS[[index.gr.2]][[1]])
						b<-(temp.TSS[[index.gr.1]][[1]]+temp.TFBS[[index.gr.2]][1])

						if (a>b){
							idx.TSS<-(pairing[,2]==all.gr.chr[index.gr.1] & pairing.gene==temp.TSS[[index.gr.1]][[2]])
							idx.TFBS<-(pairing[,1]==all.gr.chr[index.gr.2] & as.numeric(pairing[,5])==temp.TFBS[[index.gr.2]][2])
							
							if ((TFBS.yn.11 & TSS.yn.21) | (TFBS.yn.12 & TSS.yn.22)| (TSS.yn.11 & TFBS.yn.21) | (TSS.yn.12 & TFBS.yn.22)){
	                                                        return(c(b,"PB","TH"))
        	                                        }
                	                                else if ((TFBS.yn.11 & TSS.yn.22) | (TFBS.yn.12 & TSS.yn.21)| (TSS.yn.11 & TFBS.yn.22) | (TSS.yn.12 & TFBS.yn.21)){
                        	                                return(c(b,"PB","TL"))
                                	                }
							else if (sum(idx.TSS & idx.TFBS)==1) {
								return(c(b,"PB",PT[idx.TSS & idx.TFBS]))
							}	
							else {
								return(c(b,"PB","F"))
							}	
						}
						else {
							idx.TFBS<-(pairing[,1]==all.gr.chr[index.gr.1] & as.numeric(pairing[,5])==temp.TFBS[[index.gr.1]][2])
                                                        idx.TSS<-(pairing[,2]==all.gr.chr[index.gr.2] & pairing.gene==temp.TSS[[index.gr.2]][[2]])
			
							if ((TFBS.yn.11 & TSS.yn.21) | (TFBS.yn.12 & TSS.yn.22)| (TSS.yn.11 & TFBS.yn.21) | (TSS.yn.12 & TFBS.yn.22)){
                                                                return(c(a,"BP","TH"))
                                                        }
                                                        else if ((TFBS.yn.11 & TSS.yn.22) | (TFBS.yn.12 & TSS.yn.21)| (TSS.yn.11 & TFBS.yn.22) | (TSS.yn.12 & TFBS.yn.21)){
                                                                return(c(a,"BP","TL"))
                                                        }
							else if (sum(idx.TSS & idx.TFBS)==1) {
                                                                return(c(a,"BP",PT[idx.TSS & idx.TFBS]))
                                                        }
                                                        else {
                                                                return(c(a,"BP","F"))
							}
						}
					},mc.cores=mc.cores.user)

count.table$mini.distance<-as.numeric(sapply(mini.distance,"[",1))
count.table$type<-sapply(mini.distance,"[",2)
count.table$LT<-sapply(mini.distance,"[",3)

write.csv(count.table,"~/chiapet_simulation/data.csv",row.names=F,quote=F)

q("no")
