#!/usr/bin/perl use strict; use warnings; use Getopt::Long; use File::Which; ################################################################################ # Author : Isabelle GUIGON # Date : 2014-11-27 # This script converts a BAM file into a BED file for use by miRkwood. # # Dependencies : samtools, perl module File::Which # Make sure to have it installed in your PATH. # For Ubuntu/Debian distributions `sudo apt-get install samtools` is enough. ################################################################################ ########## Variables my $input_file = ''; my $bed_file = ''; my $min_length = 18; my $max_length = 25; my $time = time(); my $sorted_bam_file = "/tmp/mirkwood_bam2bed_${time}_sorted"; my $sorted_sam_file = "/tmp/mirkwood_bam2bed_${time}_sorted.sam"; my $counts; my $help; my $help_message = <<'EOF'; mirkwood-bam2bed.pl ---------- Script to convert a BAM/SAM file into a BED file for use by miRkwood. Usage : ./mirkwood-bam2bed.pl -in -bed Options : -in : input BAM or SAM -bed : output BED -min : keep only reads with length >= min (default 18) -max : keep only reads with length <= max (default 25) -help : display this message and quit Dependencies : samtools Make sure to have it installed in your PATH. For Ubuntu/Debian distributions `sudo apt-get install samtools` is enough. EOF ########## Check that the samtools are installed if ( ! which( 'samtools' ) ){ die "ERROR: samtools are missing. Please install them in your PATH.\n"; } ########## Get options GetOptions ('in=s' => \$input_file, 'bed=s' => \$bed_file, 'min=s' => \$min_length, 'max=s' => \$max_length, 'help' => \$help); ########## Validate options if ( $help ){ print $help_message; exit; } if ( ( ! -r $input_file ) || ( $bed_file eq '' ) ){ die "Missing parameter!\n" . $help_message; } if ( $min_length >= $max_length ){ die "--min should be strictly lower than --max\n" . $help_message; } ########## Create sorted SAM file if ( $input_file =~ /([^\/\\]+)[.]sam/ ){ system("sort -k 3,3 -k 4,4n $input_file | grep -v \"^@\" > $sorted_sam_file"); } elsif ( $input_file =~ /([^\/\\]+)[.](bam|dat)/ ){ ##### Sort BAM file system("samtools sort $input_file $sorted_bam_file"); ##### Convert sorted BAM into SAM and filter out unmapped reads system("samtools view -F 4 $sorted_bam_file.bam > $sorted_sam_file"); unlink $sorted_bam_file . '.bam'; } else{ die "Non correct input file. We accept BAM and SAM as input formats.\n$help_message"; } ########## Read the SAM file to store counts into a hash open(my $SAM, '<', $sorted_sam_file) or die "ERROR while reading SAM file. Program will end prematurely.\n"; while ( <$SAM> ){ chomp; my @line = split ( /\t/smx ); if ( $line[1] eq '0x4' || $line[1] eq '4' ){ next; } if ( length($line[9]) < $min_length || length($line[9]) > $max_length ){ next; } my $chromosome = $line[2]; my $start = $line[3] - 1; my $sequence = $line[9]; my $strand = '+'; if ( $line[1] eq '16' or $line[1] eq '0x10' ){ $strand = '-'; } if ( ! exists( $counts->{$chromosome}{$start}{$sequence}{$strand} ) ){ $counts->{$chromosome}{$start}{$sequence}{$strand} = 0; } $counts->{$chromosome}{$start}{$sequence}{$strand}++; } close $SAM; unlink $sorted_sam_file; ########## Browse hash tables and print data in BED file open(my $BED, '>', $bed_file) or die "ERROR while creating $bed_file. Program will end prematurely.\n"; foreach my $chromosome ( sort ( keys%{$counts} ) ){ foreach my $start ( sort {$a <=> $b} keys%{ $counts->{$chromosome} } ){ foreach my $sequence ( sort (keys%{ $counts->{$chromosome}{$start} } ) ){ foreach my $strand ( sort (keys%{ $counts->{$chromosome}{$start}{$sequence} } ) ){ my $end = $start + length($sequence); print $BED "$chromosome\t"; print $BED "$start\t"; print $BED "$end\t"; print $BED "$sequence\t"; print $BED "$counts->{$chromosome}{$start}{$sequence}{$strand}\t"; print $BED "$strand\n"; } } } } close $BED; my $total_time = time() - $time; my $day = int( $total_time / 86_400 ); my $hour = int( ($total_time % 86_400 ) / 3_600 ); my $min = int( ( ($total_time % 86_400 ) % 3_600 ) / 60 ); my $sec = int( ( ($total_time % 86_400 ) % 3_600 ) % 60 ); print "Done in $day day $hour h $min min $sec sec.\n";