#!/usr/bin/env python ''' Created on Sep 13, 2010 @author: Lance Parsons Requirements: pysam (pyrex, cython, samtools) biopython LICENSE Copyright (c) 2011, Lance Parsons All rights reserved. This script is licensed by the Simplified BSD License See LICENSE.TXT and ''' from __future__ import division from Bio import SeqIO from Bio.SeqUtils import GC from optparse import OptionParser import pysam import sys import os def main(): usage = ''' %prog [options] fasta_file bam_file repeat_interval_file gc_coverage_file > new_fasta_file new fasta file is output to STDOUT log messages are output to STDERR Arguments: 1) fasta_file - fasta file to be expanded 2) bam_file - bam file of reads mapped to sequences in fasta file (used to determine current coverage) 3) repeat_interval_file - tab-delimited with three columns: [sequence_id] [interval_start] [interval_end] start and end are 1-based coordinates 4) gc_coverage_file - tab-delimited: [min_gc_fraction] [max_gc_fraction] [median_coverage] Intervals are inclusive of min, exclusive of max [min, max)''' # Parse Arguments parser = OptionParser(usage=usage) (opts, args) = parser.parse_args() if len(args) < 4: parser.print_help() sys.exit() fasta_file = args[0] bam_file = args[1] repeat_interval_file = args[2] gc_coverage_file = args[3] # Open input files intervals = open(repeat_interval_file) orig_sequence_dict = SeqIO.index(fasta_file, "fasta") handle = open(fasta_file, "rU") new_sequence_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) handle.close() # Initialize variables offset = {} # Loop through intervals file new_intervals = [] for interval in intervals: (chrom, start, end) = interval.rstrip().split("\t") start = int(start) end = int(end) s = start - 1 if chrom not in offset: offset[chrom] = 0 new_s = s + offset[chrom] new_end = end + offset[chrom] # Determine original avg coverage avg_coverage = coverage(chrom, s, end, bam_file) sys.stderr.write("\nOriginal Interval - Chrom: %s\tStart: %i\tEnd: %i\tOffset: %i\n" % (chrom, start, end, offset[chrom])) sys.stderr.write('Average coverage: %.2f\n' % avg_coverage) # Get oligo sequence from orig sequence orig_seq_record = orig_sequence_dict[chrom] olgio_record = orig_seq_record[s:end] oligo_seq = olgio_record.seq sys.stderr.write("Oligo: %s\n" % oligo_seq) # Determine GC percent of oligo gc_percent = GC(oligo_seq) sys.stderr.write("GC Percent: %.2f\n" % gc_percent) # Predicted Coverage predicted_coverage = coverage_for_gc(gc_percent / 100, gc_coverage_file) # Predicted Copies copies = determine_copies(avg_coverage, predicted_coverage) sys.stderr.write("Predicted Coverage: %.2f\tNumber of Copies: %i\n" % (predicted_coverage, copies)) # Replace sequence new_sequence_record = new_sequence_dict[chrom] new_sequence = new_sequence_record.seq new_sequence = replace_subsequence(new_sequence, new_s, new_end, oligo_seq, copies) new_sequence_record.seq = new_sequence.toseq() new_sequence_dict[chrom] = new_sequence_record sys.stderr.write("New Interval - Chrom: %s\tStart: %i\tEnd: %i\n" % (chrom, new_s+1, new_end)) new_intervals.append([chrom, new_s+1, new_end]) # Update offset offset[chrom] = offset[chrom] + len(oligo_seq) * (copies - 1) # Print fasta to stdout for seqid in new_sequence_dict.iterkeys(): sys.stdout.write(new_sequence_dict[seqid].format("fasta")) sys.stderr.write("\n\nSequence %s\tOrig length: %i\tNew Length: %i\n" % (seqid, len(orig_sequence_dict[seqid].seq), len(new_sequence_dict[seqid].seq))) # Print new intervals to stderr #print new_intervals.size() sys.stderr.write("\nNew Intervals\n") for interval in new_intervals: sys.stderr.write("%s\t%i\t%i\n" % (interval[0], interval[1], interval[2])) sys.stderr.write("\n") def coverage(chrom, start, end, bam_file): ''' Returns average coverage for specified region in bam_file Reads bam_file, indexes if necessary ''' bam_index = "%s.bai" % bam_file if (not os.path.isfile(bam_index)): sys.stderr.write("Generating index for bamfile: %s" % bam_file) pysam.index(bam_file) samfile = pysam.Samfile(bam_file, "rb") tot_cov = 0 num_pos = 0 for pileupcolumn in samfile.pileup(chrom, start, end): if (pileupcolumn.pos >= start and pileupcolumn.pos < end): num_pos += 1 tot_cov += pileupcolumn.n samfile.close() avg_cov = tot_cov / num_pos return avg_cov def coverage_for_gc(gc_content, gc_coverage_file): ''' Read tab delimited file with three columns: min_gc_fraction max_gc_fraction coverage Intervals are inclusive of min, exclusive of max [min, max) ''' gc_coverage = open(gc_coverage_file) for line in gc_coverage: (start, end, coverage) = line.rstrip().split('\t') if (gc_content >= float(start) and gc_content < float(end)): return int(coverage) raise Exception('Coverage not found in file') def determine_copies(actual_coverage, predicted_coverage): ''' Returns number of predicted copies given actual and predicted coverage predicted = (actual/predicted) rounded to nearest integer ''' return max(1,int(round(actual_coverage / predicted_coverage))) def replace_subsequence(sequence, start, end, oligo, copies): ''' Replaces specified subsequence (BioPython Seq obj) in sequence with specified copies of oligo Returns the updated sequence ''' new_sequence = sequence.tomutable() sys.stderr.write("Oligo : %s\n" % oligo) sys.stderr.write("Replaced oligo : %s\n" % new_sequence[start:end]) replacement_oligo = str(oligo) * copies sys.stderr.write("Replacement Oligo: %s\n" % replacement_oligo) new_sequence[start:end] = replacement_oligo return new_sequence if __name__ == '__main__': main() sys.exit()