
@HWI-XXXXX_0470:1:1:1761:946#0/1 NCCCTTTGGATTTGTCCTTTCAGATGCATTAGCCAT +HWI-XXXXX_0470:1:1:1761:946#0/1 BKNNJMHHEJRTRRR_______________Y[[[Y_ The first line, starting with “@”, and the third, starting with “+”, are descriptive, contain the name, slide number, etc. The second line is the read sequence of nucleotides, in our case there are 36 of them. The fourth line is the probabilities of reading the nucleotides. Below is a link to an exact description of the file format and how to get numbers from the letter representation http://en.wikipedia.org/wiki/FASTQ_format . /usr/src/bowtie-0.12.7/bowtie -q -v 2 -m 1 --best --strata -p 8 -S \ /usr/src/bowtie-0.12.7/indexes/hg19 - | samtools view -Sb - >${DIRNAME}/${SEQNAME}.bam @HD VN:1.0 SO:unsorted @SQ SN:chr10 LN:135374737 @SQ SN:chr11 LN:134452384 … @PG ID:Bowtie VN:0.12.7 CL:"/usr/src/bowtie-0.12.7/bowtie -q -v 1 -m 1 --best --strata -p 8 -S" USI-EAS21_8_3445_7_1_452_540 0 chr5 39195556 255 23M * 0 0 CTCCAGATTCTCTGGAAAGATGG SSSSSSPSSSSSSSSGNNSJSSL XA:i:0 MD:Z:23 NM:i:0 USI-EAS21_8_3445_7_1_170_242 0 chr14 73509925 255 23M * 0 0 CTGCATTAGACCTAGGCTTAGAA SSSSSSSSSSSSSSSSSSSGSNN XA:i:0 MD:Z:23 NM:i:0 USI-EAS21_8_3445_7_1_290_985 0 chr7 142613233 255 23M * 0 0 AGCTGACTGGCAAGCAACAGAGT JSSSSSSSSSSSPSSGSSLSNSL XA:i:0 MD:Z:23 NM:i:0 USI-EAS21_8_3445_7_1_881_711 4 * 0 0 * * 0 0 ATCGGAAGAGCTCGTATGCCGTC SSSSPSSSSSSSSSSSSSSSSNS XM:i:0 USI-EAS21_8_3445_7_1_897_318 0 chr3 50101427 255 23M * 0 0 GGGCGCAGAACCGCTGCTGCTGC SPSSSSSSSPPSSSSNSPPPLSL XA:i:1 MD:Z:15A7NM:i:1 
, will be
= S * W / G = S / N.
expresses the probability of the origin of a given number k of events over a period of time with a known average origin of events. The cumulative Poisson distribution will express the probability of occurrence of events of up to k. We will use the cumulative distribution, since we are interested in the probability that the number of reads in the window is less than k. Null hypothesis: k reads in a window by accident. Alternative hypothesis: k reads in the window is not accidental. The probability that our null hypothesis is true is even
. The closer P (k,
) to 0, the less likely that the reads hit the specified window by accident. In our example, we obtain that if the number of reads in the window is up to 15, then 1-P (15.0.876193) = 2.55351E-15, i.e. The number of reads up to 15 can enter a window 500 wide by chance with a probability of 2.55351E-15. This number is also called p value. Consequently, if there are more than 15 reads, then they are even less likely to enter the window by accident. Thus, in the program, you can set the condition: if p value <= 1.e16, then the data in the window should be left for consideration, the rest should be discarded. #ifndef PoissonHandler_H #define PoissonHandler_H #include <config.hpp> class PoissonHandler : public QState { Q_OBJECT private: GenomeDescription *sql_input; GenomeDescription *sam_input; QSettings setup; public: PoissonHandler(GenomeDescription *sql,GenomeDescription *sam,HandledData &output,QState *parent=0); ~PoissonHandler(); protected: virtual void onEntry(QEvent* event); }; #endif // #include "PoissonHandler.hpp" //------------------------------------------------------------- //------------------------------------------------------------- PoissonHandler::~PoissonHandler() { } //------------------------------------------------------------- //------------------------------------------------------------- PoissonHandler::PoissonHandler(GenomeDescription *sql,GenomeDescription *sam,HandledData &,QState * parent): QState(parent), sql_input(sql), sam_input(sam) { } //------------------------------------------------------------- //------------------------------------------------------------- void PoissonHandler::onEntry(QEvent*) { if(!setup.contains("AvgTagWindow")) setup.setValue("AvgTagWindow",500); if(!setup.contains("siteShift")) setup.setValue("siteShift",75); qint32 window=setup.value("AvgTagWindow").toInt(); qint32 half_window=0;//window/2; qint32 shift=setup.value("siteShift").toInt(); int c=0;//sense count, even/odd c sense/nonsense QMap<QString,int> poisson; QMapIterator<QChar,QMap<QString,QMap<qint32,qint32> > > sense(sql_input->genome); while(sense.hasNext()) { sense.next(); c++; QMapIterator<QString,QMap<qint32,qint32> > dictionary(sense.value()); while(dictionary.hasNext()) { dictionary.next(); QMapIterator<qint32,qint32> s(dictionary.value()); //sql result for strand+chromosome if((sam_input->genome['+']).contains(dictionary.key())) //chromosome chr1,chr2... { QMapIterator<qint32,qint32> k( (sam_input->genome['+'])[dictionary.key()] );//+ strand,sam file, iteration over strand+chromosome s.next(); //c++; k.next(); while (k.hasNext()) // + strand, iteration over strand+chromosome+each position { //if align is inside of the segment of the start site qint32 x1=k.key()- s.key() +half_window +shift; qint32 x2=x1-window; if( (x1^x2) < 0 ) {//inside segment poisson[((sql_input->genome_h[sense.key()])[dictionary.key()])[s.key()]]+=k.value(); k.next(); } else { if((k.key() +shift)>(s.key()+half_window)) { if(!s.hasNext()) break; s.next();// c++; } else { k.next(); } } }//while k.hasNext }//if + strand sam input if((sam_input->genome['-']).contains(dictionary.key())) { QMapIterator<qint32,qint32> j( (sam_input->genome['-'])[dictionary.key()] );//- strand,sam file s.toFront(); s.next(); j.next(); while (j.hasNext()) // - strand { //if align is inside of the segment of the start site qint32 x1=j.key()- s.key() +half_window -shift; qint32 x2=x1-window; if( (x1^x2) < 0 ) {//inside segment poisson[((sql_input->genome_h[sense.key()])[dictionary.key()])[s.key()]]+=j.value(); j.next(); } else { if((j.key() -shift)>(s.key()+half_window)) { if(!s.hasNext()) break; s.next(); } else { j.next(); } } } }//if - strand sam file }//while dictionary }//while sense //Poisson { qreal lamda=(qreal)(window*(sam_input->total-sam_input->notAligned))/(sam_input->tot_len*0.741); QFile _outFile; _outFile.setFileName(setup.value("outFileName").toString()); _outFile.open(QIODevice::WriteOnly|QIODevice::Truncate); qreal exp_l=qExp(-lamda); QString str; foreach(str,poisson.keys()) { //P(x,L)=(e**-1)*(L**x)/x! // qreal pois=exp_l; //qExp(-lamda)*qPow(lamda,poisson[str]); // for(int i=2; i<= poisson[str];i++) // pois=pois*lamda/i; //P(x,L)=(e**-1)*E(1-k)(L**x(i))/x(i)! qreal pois=exp_l; qreal delim=1.0; qreal sum=1.0; for(int i=1; i<= poisson[str];i++) { delim*=lamda/i; sum+=delim; } pois*=sum; _outFile.write( (QString("%1\t%2\t%3\n").arg(str).arg(poisson[str]).arg(pois,0,'e',20)).toLocal8Bit() ); } _outFile.flush(); _outFile.close(); }//if(1) }//end of function //------------------------------------------------------------- //------------------------------------------------------------- Source: https://habr.com/ru/post/137267/
All Articles