import java.io.*; public class Biased { public static class indexPair { public final int fst, snd; public indexPair(int fst, int snd) { this.fst = fst; this.snd = snd; }//constructor public String toString() { return( "(" + fst + "," + snd + ")" ); } }//indexPair class // ---------------------------------------------------------------------- // ---------------------------------------------------------------------- // CONDITIONS OF USE: Provided that you | // (a) cite: | // L.Allison, | // Longest Biased Interval and Longest Non-Negative Sum Interval, | // Bioinformatics, V19, #10, pp1294-1295, July 2003, | // if this code is used in, or contributes | // towards, published work, and | // (b) maintain this notice, then | // (c) the code is available to you under the | // GNU General Public Licence (GPL) | // ( see http://www.gnu.org/copyleft/gpl.html ), | // otherwise the code is not available for use by you. | // -- L.A. 30/1/2003, 10/2003. | // ---------------------------------------------------------------------- // ---------------------------------------------------------------------- // ---------------------------------------------------------------------------- //----------------------------------------------------------------------- //----------------------------------------------------------------------- // First algorithm as per | // J. Bioinformatics, 19(10), pp1294-1295, 1 July 2003 | // It is almost always linear time. | //----------------------------------------------------------------------- //----------------------------------------------------------------------- // Find the longest bi@sed.interval in `seq[]', // i.e. that is at least `fraction' made up of chars in `select', // and that is as long as possible. public static indexPair // longestBiased1(...) longestBiased1( String select, double fraction, char[] seq ) { double[] fstMinVal = new double[seq.length+1]; // new minimum values and int[] fstMinPos = new int [seq.length+1]; // their positions. double cumSum = 0, lastCumSum = 0; // cumulative sum & previous value int begins = 0, ends = -1, // best interval so far tracker = 0, // tracks fstMin...[] numMin = 1; // effective size of fstMin...[] fstMinVal[0] = 0; fstMinPos[0] = -1; // seq[0..-1] is the empty interval fraction = fraction / ( 1 + 0.5/seq.length ); // + a little "slack" if( fraction > 1 ) fraction = 1; // sanity! else if( fraction < 0 ) fraction = 0; double good = 1-fraction; double bad = -fraction; int maxSelected = 0; // make boolean array for O(1)-time test of chars for( int i = 0; i < select.length(); i++ ) if( (int) select.charAt(i) > maxSelected ) maxSelected = (int) select.charAt(i); boolean[] selected = new boolean[maxSelected+1]; for( int i = 0; i <= maxSelected; i++ ) selected[i] = select.indexOf((char) i) >= 0; for( int i = 0; i < seq.length; i++ ) { // INV: seq[begins..ends] is longest bi@sed.interval of seq[0..i-1], // fstMinVal[] is strictly decreasing and with // fstMinPos[] shows the 1st occurences of minima. int seqI = (int) seq[i]; cumSum += (seqI <= maxSelected && selected[seqI]) ? good : bad; if( cumSum < fstMinVal[numMin-1] ) // a new, lower minimum { fstMinVal[numMin] = cumSum; fstMinPos[numMin] = i; numMin++; } // NB. fstMinVal[] is in *strictly* decreasing order. if( cumSum >= lastCumSum ) // it's increasing, getting better while( tracker > 0 && fstMinVal[tracker-1] <= cumSum ) tracker-- ; // tracker == 0 or fstMinVal[tracker-1] > cumSum else // cumSum < lastCumSum it's decreasing, getting worse while( tracker < numMin-1 && fstMinVal[tracker] > cumSum ) tracker++ ; // tracker == numMin-1 or fstMinVal[tracker] <= cumSum // (tracker == 0 or fstMinVal[tracker-1] > cumSum) and // (tracker == numMin-1 or fstMinVal[tracker] <= cumSum) // In fact, tracker is least index s.t. fstMinVal[tracker] <= cumSum, // so seq[fstMinPos[tracker]+1 .. i] is a candidate interval. if( i-fstMinPos[tracker] > ends-begins+1 ) // ? new longest ? { begins = fstMinPos[tracker]+1; ends = i; } lastCumSum = cumSum; }//for i // seq[begins..ends] is longest bi@sed.interval of seq[0..seq.length-1] return( new indexPair(begins, ends) ); }//longestBiased1 // ---------------------------------------------------------------------------- // ---------------------------------------------------------------------- // ---------------------------------------------------------------------- // ... and here is code for a 3-pass *always* linear-time algorithm | // that was suggested by Sung Kwon Kim [skkim at cau.ac.kr] in | // personnal communication, 28 July, 12 Aug 2003 ... | // ---------------------------------------------------------------------- // ---------------------------------------------------------------------- // Find the longest bi@sed.interval in `seq[]', // i.e. that is at least `fraction' made up of chars in `select', // and that is as long as possible. public static indexPair // longestBiased2(...) longestBiased2( String select, double fraction, char[] seq ) { fraction = fraction / ( 1 + 0.5/seq.length ); // + a little "slack" if( fraction > 1 ) fraction = 1; // sanity! else if( fraction < 0 ) fraction = 0; double good = 1-fraction; double bad = -fraction; int maxSelected = 0; // make boolean array for O(1)-time test of chars for( int i = 0; i < select.length(); i++ ) if( (int) select.charAt(i) > maxSelected ) maxSelected = (int) select.charAt(i); boolean[] selected = new boolean[maxSelected+1]; for( int i = 0; i <= maxSelected; i++ ) selected[i] = select.indexOf((char) i) >= 0; // 1st pass // btw. double is better than float for veryy longgg sequences double[] cumSums = new double[seq.length]; // cumulative sums double cumSum = 0; for( int i = 0; i < seq.length; i++ ) { int seqI = (int) seq[i]; cumSum += (seqI <= maxSelected && selected[seqI]) ? good : bad; cumSums[i] = cumSum; }//for i // 2nd pass (reverse) int[] lastAbove = new int[seq.length]; // last maxima of cumSums[] int finalDescent = seq.length-1; for( int i = seq.length-1; i >= 0; i-- ) { if( cumSums[i] > cumSums[finalDescent] ) finalDescent = i; // finalDescent is the right-most index to // maximize cumSums[i..seq.length-1] lastAbove[i] = finalDescent; }//for i // NB. cumSums[lastAbove[i]] is montonic, flat or decreasing, // as i increases. //3rd pass int j = -1; double cumSumI1 = 0; int begins = 0, ends = -1; // so far, best interval is empty for( int i = 0; i < seq.length; i++ ) { // cumSumI1 = SUM scores of seq[0..i-1] // It is sufficient to consider intervals [i..j] where i is a // new "first below" minimum and j is a "last above" maximum. while( j < seq.length-1 && cumSums[lastAbove[j+1]] >= cumSumI1 ) j = lastAbove[j+1]; if( j - i > ends - begins ) { begins = i; ends = j; } cumSumI1 = cumSums[i]; }//for i return( new indexPair(begins, ends) ); }//longestBiased2 // ---------------------------------------------------------------------------- public static char[] readSeq(String fileName, String dontIgnore) { StringBuffer sb = new StringBuffer(); try{ File inputFile = new File( fileName ); FileReader in = new FileReader(inputFile); int c; while((c = in.read()) != -1) { if( dontIgnore.indexOf(c) < 0 ) continue; char ch = (char) c; sb.append( ch ); } } catch(Exception e) { System.out.println( "failed: " + e.getMessage() ); System.exit(1); } String s = sb.toString(); char[] arr = new char[s.length()]; for( int i = 0; i < s.length(); i++ ) arr[i] = s.charAt(i); return arr; }//readSeq public static indexPair run( String select, double fraction, char[] seq ) // Do a scan of the String at the given fraction and print the findings. { int cutoff = 20; //--------------------------------------------------------------------- // Comment one of these out, and also make consequent changes, if you | // only want to run one of the algorithms ... | //--------------------------------------------------------------------- indexPair interval1 = longestBiased1( select, fraction, seq ); indexPair interval2 = longestBiased2( select, fraction, seq ); if( interval1.snd - interval1.fst != interval2.snd - interval2.fst ) System.out.print( "DISAGREEMENT "); int len = interval1.snd - interval1.fst + 1; System.out.print( " >=" + fraction + ": [" + interval1.fst + ".." + interval1.snd + "] (" + len + ") " ); if( len <= cutoff ) // short, print all characters { for(int i = interval1.fst; i <= interval1.snd; i++ ) System.out.print( seq[i] ); } else // long, print first few and last few characters { for(int i = interval1.fst; i < interval1.fst + cutoff/2; i++) System.out.print( seq[i] ); System.out.print( "..." ); for(int i = interval1.snd - cutoff/2 + 1; i <= interval1.snd; i++) System.out.print( seq[i] ); } System.out.print( " [" + + interval2.fst + ".." + interval2.snd + "]" ); System.out.println(); return interval1; }//run public static void main(String[] argv) { System.out.println("#--- Biased.java, L.A., CSSE, Monash, .au ---"); for(int i=0; i < argv.length; i++) // command line params if any System.out.println("# argv[" + i + "] = " + argv[i]); String select = ""; double fraction = 0.5; String fileName = ""; try { select = argv[0]; fraction = new Double(argv[1]).doubleValue(); // read the min' bi@s fileName = argv[2]; } catch(Exception e) { System.out.println("usage: java prog_name \"chars\" fraction filename"); System.exit(1); } char[] arr = readSeq( fileName, "acgtuACGTUrnyRNY" ); // INPUT THE STRING // NB. you may want to change the set of characters that are not ignored. System.out.print("# " + arr.length + "bp"); System.out.print(" "); for(int i=0; i < (arr.length < 65 ? arr.length : 65); i++) System.out.print( arr[i] ); System.out.println( arr.length > 65 ? "..." : "" ); if( fraction >= 0 && fraction <= 1 ) // a "proper" fraction run( select, fraction, arr ); else // specified fraction > 1 or < 0; take this as a flag to try several // decreasing fractions until we get the whole string, and stop. for( int ifrac = 100; ifrac > 0; ifrac -= 5 ) { indexPair interval = run( select, ifrac/100.0, arr ); if( interval.snd - interval.fst + 1 >= arr.length ) break; } System.out.println("#--- end ---"); }//main }//Biased class // --------------------------------------------------------------------- // 10/2002, 1/2002, 7/2003, 10/2003 | // L.Allison, CSSE, Monash University, .au | // http://www.csse.monash.edu.au/~lloyd/tildeProgLang/Java2/Biased/ | // Note the ``conditions of use'' in the code above. | // ---------------------------------------------------------------------