/***************************************************************/ /* /* Gene Comparison Program /* Copyright (c) 1992-1993 by Atul Butte and Paolo DePetrillo /* /***************************************************************/ #include #include #include #include #include #define MAX( a, b ) ( ( (a) > (b) ) ? (a) : (b) ) #define MIN( a, b ) ( ( (a) > (b) ) ? (b) : (a) ) typedef char BASE; FILE * inFile; FILE * outFile; BASE * seq1 = NULL; BASE * seq2 = NULL; typedef struct s_store { int frameShift; int maxLengthMatches; int compareAtMax; int where1; int where2; int length; double D; } store; store * store_array = NULL; store store_temp; int * compare = NULL; void exit_cleanup( ) { if( store_array != NULL ) free( store_array ); if( compare != NULL ) free( compare ); if( seq1 != NULL ) free( seq1 ); if( seq2 != NULL ) free( seq2 ); printf( "cleaned\n" ); fflush( stdout ); } double get_slope( int maxLengthMatches, int * compare, char * valid ) { register double sum_x = 0.0; register double sum_y = 0.0; register double sum_x_times_y = 0.0; register double sum_x_squared = 0.0; int i; int occur; register double log_occur; int windowSize; if( maxLengthMatches == 1 ) { *valid = 0; return 0; } windowSize = 0; do { windowSize++; occur = 0; for( i = windowSize; i <= maxLengthMatches; i++ ) { occur += ( i - windowSize + 1 ) * compare[i]; } if( maxLengthMatches == 1 ) { maxLengthMatches = 1; } if( occur != 0 ) { log_occur = log( (double) occur ); sum_x += (double) windowSize; sum_x_squared += (double) ( windowSize * windowSize ); sum_y += (double) log_occur; sum_x_times_y += (double) ( windowSize * log_occur ); } else { windowSize--; } } while( occur != 0 ); *valid = 1; return (double) (sum_x * sum_y - windowSize * sum_x_times_y ) / ( sum_x * sum_x - windowSize * sum_x_squared ); } int mod_codon( int frame ) { int t; int rem; if( frame < 0 ) { t = - frame; rem = t % 3; if( rem == 1 ) rem = 2; else if( rem == 2 ) rem = 1; return rem; } else return frame % 3; } read_files( int argc, char *argv[] ) { char filename[60]; int i; int memneed; i = 1; do { sprintf( filename, "results%d", i ); outFile = fopen( filename, "r" ); if( outFile != NULL ) fclose( outFile ); i++; } while( outFile != NULL ); outFile = fopen( filename, "w" ); // temp = fopen( "graphme", "w" ); printf( "Gene Comparison Program\nby Atul Butte\n" ); printf( "Copyright (c) 1992-1993 by Atul Butte and Paolo DePetrillo\n\n" ); printf( "Results will be placed in the file \"%s\"\n\n", filename ); fprintf( outFile, "Gene Comparison Program\nby Atul Butte\n" ); fprintf( outFile, "Copyright (c) 1992-1993 by Atul Butte and Paolo DePetrillo\n\n" ); if( argc == 3 ) { inFile = fopen( argv[1], "r" ); if( inFile == NULL ) { printf( "File \"%d\" not found.\n", argv[1] ); exit( 1 ); } strcpy( filename, argv[1] ); } else { do { printf( "Please enter the file name for the first gene sequence:\n" ); scanf( "%[^\n]%*c", filename ); inFile = fopen( filename, "r" ); if( inFile == NULL ) printf( "File not found. Please try again.\n" ); } while( inFile == NULL ); } fseek( inFile, 0L, SEEK_END ); memneed = sizeof( BASE ) * ( ftell( inFile ) + 1 ); seq1 = malloc( memneed ); if( seq1 == NULL ) { printf( "Need more memory to hold sequence \"%d\"\n", filename ); printf( "Try increasing this program's memory by %dK\n", memneed / 1024 ); exit( 1 ); } rewind( inFile ); fscanf( inFile, "%[^@]", seq1 ); fclose( inFile ); fprintf( outFile, "Comparing \"%s\" and ", filename ); if( argc == 3 ) { inFile = fopen( argv[2], "r" ); if( inFile == NULL ) { printf( "File \"%d\" not found.\n", argv[2] ); exit( 1 ); } strcpy( filename, argv[2] ); } else { do { printf( "Please enter the file name for the second gene sequence:\n" ); scanf( "%[^\n]%*c", filename ); inFile = fopen( filename, "r" ); if( inFile == NULL ) printf( "File not found. Please try again.\n" ); } while( inFile == NULL ); } fseek( inFile, 0L, SEEK_END ); memneed = sizeof( BASE ) * ( ftell( inFile ) + 1 ); seq2 = malloc( memneed ); if( seq2 == NULL ) { printf( "Need more memory to hold sequence \"%d\"\n", filename ); printf( "Try increasing this program's memory by %dK\n", memneed / 1024 ); exit( 1 ); } rewind( inFile ); fscanf( inFile, "%[^@]", seq2 ); fclose( inFile ); if( argc == 3 ) printf( "Comparing \"%s\" and \"%s\"\n", argv[1], argv[2] ); fprintf( outFile, "\"%s\"\n", filename ); fprintf( outFile, "\nshift\tc\tD\tmax w\tcnt\twhere\n" ); printf( "\nWorking...\n" ); fflush( stdout ); } clean_files( ) { BASE * seq1index; BASE * seq2index; BASE base; seq1index = seq1; seq2index = seq1; while( (base = *seq1index) != 0 ) { seq1index++; base = toupper( base ); if( base == 'A' || base == 'G' || base == 'C' || base == 'T' ) { *seq2index = base; seq2index++; } } *seq2index = 0; seq1index = seq2; seq2index = seq2; while( (base = *seq1index) != 0 ) { seq1index++; base = toupper( base ); if( base == 'A' || base == 'G' || base == 'C' || base == 'T' ) { *seq2index = base; seq2index++; } } *seq2index = 0; } main( int argc, char* argv[] ) { int seq1truelength; int seq2truelength; int * compareIndex; int compareSize; BASE * seq1index; BASE * seq2index; int seq1length; int seq2length; int currentMatches; int maxLengthMatches; BASE * maxIndex1; BASE * maxIndex2; int i; int frameShift; int frameShiftRange = 0; double slope; double sum_slope = 0.0; double sum_slope_squared = 0.0; double sum_slope_by_codon[3] = { 0.0, 0.0, 0.0 }; int count_each_slope[3] = { 0, 0, 0 }; int count_no_slope[3] = { 0, 0, 0 }; char valid; int codon; store * store_ptr1; store * store_ptr2; int memneed; int count_p_tiles[5] = { 0, 0, 0, 0, 0 }; double p_tiles[5] = { 0.05, 0.10, 0.15, 0.20, 0.25 }; double mean_D; double sd_D; atexit( &exit_cleanup ); read_files( argc, argv ); clean_files( ); seq1truelength = strlen( seq1 ); seq2truelength = strlen( seq2 ); compareSize = MIN( seq1truelength, seq2truelength ); memneed = sizeof( *compare ) * compareSize; compare = malloc( memneed ); if( compare == NULL ) { printf( "Need more memory to hold comparison vector\n" ); printf( "Try increasing this program's memory by %dK\n", memneed / 1024 ); exit( 1 ); } memneed = sizeof( *store_array ) * ( seq1truelength + seq2truelength - 190 ); store_array = malloc( memneed ); if( store_array == NULL ) { printf( "Need more memory to hold array of D values\n" ); printf( "Try increasing this program's memory by %dK\n", memneed / 1024 ); exit( 1 ); } for( frameShift = - (seq2truelength - 101); frameShift <= seq1truelength - 101; frameShift++ ) { if( frameShift < 0 ) { seq1index = seq1; seq2index = &seq2[-frameShift]; } else { seq1index = &seq1[frameShift]; seq2index = seq2; } seq1length = strlen( seq1index ); seq2length = strlen( seq2index ); compareIndex = compare; for( i = 0; i < compareSize; i++ ) { *compareIndex = 0; compareIndex++; } maxLengthMatches = 0; currentMatches = 0; for( i = 0; i < MIN( seq1length, seq2length ); i++ ) { if( *seq1index == *seq2index ) { currentMatches++; } else if( currentMatches > 0 ) { compare[currentMatches]++; if( currentMatches > maxLengthMatches ) { maxLengthMatches = currentMatches; maxIndex1 = seq1index; maxIndex2 = seq2index; } currentMatches = 0; } seq1index++; seq2index++; } if( currentMatches > 0 ) { compare[currentMatches]++; if( currentMatches > maxLengthMatches ) maxLengthMatches = currentMatches; currentMatches = 0; } valid = 0; slope = 1.0 - get_slope( maxLengthMatches, compare, &valid ); codon = mod_codon( frameShift ); if( valid == 0 ) { count_no_slope[codon]++; slope = -99999; } else { sum_slope += slope; sum_slope_squared += slope * slope; sum_slope_by_codon[codon] += slope; count_each_slope[codon] ++; } store_array[frameShiftRange].frameShift = frameShift; store_array[frameShiftRange].maxLengthMatches = maxLengthMatches; store_array[frameShiftRange].compareAtMax = compare[maxLengthMatches]; store_array[frameShiftRange].where1 = maxIndex1 - seq1 + 1 - maxLengthMatches; store_array[frameShiftRange].where2 = maxIndex2 - seq2 + 1 - maxLengthMatches; store_array[frameShiftRange].length = MIN( seq1length, seq2length ); store_array[frameShiftRange].D = slope; frameShiftRange++; } printf( "Sorting...\n" ); fflush( stdout ); for( i = 1; i < frameShiftRange; i++ ) { store_ptr1 = &store_array[i-1]; store_ptr2 = &store_array[i]; store_temp = store_array[i]; if( store_temp.D < store_ptr1->D ) { while( store_ptr1 >= store_array && store_temp.D < store_ptr1->D ) { *store_ptr2 = *store_ptr1; store_ptr2--; store_ptr1--; } if( store_ptr2 >= store_array ) { *store_ptr2 = store_temp; } else { store_array[0] = store_temp; } } } store_ptr1 = store_array; for( i = 0; i < frameShiftRange; i++ ) { if( store_ptr1->maxLengthMatches == 1 ) fprintf( outFile, "%d\t%d\t*****\t%d\t%d\t(@%d-%d,%d-%d)\t%d\n", store_ptr1->frameShift, mod_codon(store_ptr1->frameShift), store_ptr1->maxLengthMatches, store_ptr1->compareAtMax, store_ptr1->where1, store_ptr1->where1 + store_ptr1->maxLengthMatches, store_ptr1->where2, store_ptr1->where2 + store_ptr1->maxLengthMatches, store_ptr1->length ); else fprintf( outFile, "%d\t%d\t%f\t%d\t%d\t(@%d-%d,%d-%d)\t%d\n", store_ptr1->frameShift, mod_codon(store_ptr1->frameShift), store_ptr1->D, store_ptr1->maxLengthMatches, store_ptr1->compareAtMax, store_ptr1->where1, store_ptr1->where1 + store_ptr1->maxLengthMatches, store_ptr1->where2, store_ptr1->where2 + store_ptr1->maxLengthMatches, store_ptr1->length ); store_ptr1++; } fflush( outFile ); mean_D = (double) sum_slope / frameShiftRange; sd_D = (double) sqrt( ( sum_slope_squared - sum_slope * sum_slope / frameShiftRange ) / (frameShiftRange - 1 ) ) ; printf( "\nmean D %f, sd %f\n", mean_D, sd_D ); fprintf( outFile, "\nmean D %f, sd %f\n", mean_D, sd_D ); for( i = 0; i < 3; i++ ) { fprintf( stdout, "mean D[%d] %f, %d with no slope\n", i, (double) sum_slope_by_codon[i] / count_each_slope[i], count_no_slope[i] ); fprintf( outFile, "mean D[%d] %f, %d with no slope\n", i, (double) sum_slope_by_codon[i] / count_each_slope[i], count_no_slope[i] ); } { int count2; double standard_score; store_ptr1 = store_array; count2 = 0; for( i = 0; i < frameShiftRange; i++ ) { double p; int j; if( store_ptr1->maxLengthMatches > 1 ) { standard_score = ( (double) store_ptr1->D - mean_D ) / sd_D; if( standard_score < 0 ) { count2++; p = (double) 0.4444444 / (standard_score * standard_score); for( j = 0; j < 5; j++ ) { if( p <= p_tiles[j] ) count_p_tiles[j]++; } } } store_ptr1++; } fprintf( stdout, "\np <=\tcount\tpercentage\n" ); fprintf( outFile, "\np <=\tcount\tpercentage\n" ); for( i = 0; i < 5; i++ ) { fprintf( stdout, "%4.2f\t%d\t%f%\n", p_tiles[i], count_p_tiles[i], (double) 100.0 * count_p_tiles[i] / count2 ); fprintf( outFile, "%4.2f\t%d\t%f%\n", p_tiles[i], count_p_tiles[i], (double) 100.0 * count_p_tiles[i] / count2 ); } fprintf( stdout, "N\t%d\n", count2 ); fprintf( outFile, "N\t%d\n", count2 ); } { int count1; int count2; double minDinRange; double maxDinRange; int j; fprintf( stdout, "\nDz >=\tand <\tcount\n" ); fprintf( outFile, "\nDz >=\tand <\tcount\n" ); count1 = 0; count2 = 0; minDinRange = mean_D + -6 * sd_D; store_ptr1 = store_array; for( j = 0; j < frameShiftRange; j++ ) { if( store_ptr1->maxLengthMatches > 1 ) { if( store_ptr1->D < minDinRange ) count2++; } else count1++; store_ptr1++; } fprintf( stdout, "INF\t\t%d\n", count1 ); fprintf( outFile, "INF\t\t%d\n", count1 ); fprintf( stdout, "\t%d\t%d\n", -6, count2 ); fprintf( outFile, "\t%d\t%d\n", -6, count2 ); for( i = -6; i < 7; i++ ) { minDinRange = mean_D + i * sd_D; maxDinRange = mean_D + (i + 1) * sd_D; count2 = 0; store_ptr1 = store_array; for( j = 0; j < frameShiftRange; j++ ) { if( store_ptr1->maxLengthMatches > 1 ) { if( store_ptr1->D < minDinRange ); else if( store_ptr1->D >= maxDinRange ) break; else if( store_ptr1->D >= minDinRange && store_ptr1->D < maxDinRange ) { count2++; } } store_ptr1++; } fprintf( stdout, "%d\t%d\t%d\n", i, i+1, count2 ); fprintf( outFile, "%d\t%d\t%d\n", i, i+1, count2 ); } count1 = 0; store_ptr1 = store_array; for( j = 0; j < frameShiftRange; j++ ) { if( store_ptr1->maxLengthMatches > 1 ) { if( store_ptr1->D >= maxDinRange ) count1++; } store_ptr1++; } fprintf( stdout, "%d\t\t%d\n", 7, count1 ); fprintf( outFile, "%d\t\t%d\n", 7, count1 ); fprintf( stdout, "\nTOT\t\t%d\n", frameShiftRange ); fprintf( outFile, "\nTOT\t\t%d\n", frameShiftRange ); } fclose( outFile ); }