Here is a script to do this. I happened to have recently made the main tool required (code to find the "mask" in genomic coordinates of all coding sequences of all isoforms of a gene), and it was still a little messy to throw together. Attempt to read the code at your own risk.
Give the script an ENSG(d+) stable id at the command line and get ENSG${1}_aligned_isoforms.fasta out. This is essentially instant on a local installation of the Ensembl database but takes ~30 seconds if connecting to ensembldb.ensembl.org. If you need to do this for more than a few genes, you will want to install locally.
#!/usr/bin/perl
use Bio::EnsEMBL::Registry;
use warnings;
my $registry = 'Bio::EnsEMBL::Registry';
$registry->load_registry_from_db(
-host => 'ensembldb.ensembl.org',
-user => 'anonymous',
-port => 5306
);
my $gene_name = shift or die "Give an ENSG at the command line";
# Set up database adaptors
my $gene_adaptor = $registry->get_adaptor("human", "core", "gene");
my $transcript_adaptor = $registry->get_adaptor("human", "core", "transcript");
my $gene = $gene_adaptor->fetch_by_stable_id($gene_name);
open(OUTPUT, ">" . $gene->stable_id . "_aligned_isoforms.fasta");
my $strand = $gene->strand;
# Build the combined set of intervals in genomic coordinates that all isoforms' coding sequences cover
my ($left_endpoints, $right_endpoints) = get_combined_coding_regions($gene);
# For convenience, compute the partial sums of the lengths of these intervals in the strand-appropriate direction
$partial_lengths = partial_lengths($left_endpoints, $right_endpoints, $strand);
my $transcripts = $gene->get_all_Transcripts;
for my $transcript (@$transcripts) {
my $has_coding_region = 0;
# Initialize this isoform's sequence to all gap
my $sequence = '-' x $partial_lengths->[@$partial_lengths - 1];
my $exons = $transcript->get_all_Exons;
for my $exon (@$exons) {
my $coding_region_start = $exon->coding_region_start($transcript);
my $coding_region_end = $exon->coding_region_end($transcript);
if ($coding_region_start) {
$has_coding_region = 1;
my $length = $coding_region_end - $coding_region_start + 1;
# Find the left edge of this exon's coding sequence in the alignment's coordinate system
my $aligned_left = slice_coords_to_aligned_coords($coding_region_start, $coding_region_end, $left_endpoints, $right_endpoints, $partial_lengths, $strand);
# Fill in the appropriate region of this isoform's aligned sequence with this exon's coding sequence
substr($sequence, $aligned_left, $length) = $exon->slice->subseq($coding_region_start, $coding_region_end, $strand);
}
}
if ($has_coding_region) {
print OUTPUT ">" . $gene->stable_id . "-" . $transcript->stable_id . "\n";
print OUTPUT $sequence;
print OUTPUT "\n";
}
}
sub get_combined_coding_regions {
my $gene = shift @_;
my $transcripts = $gene->get_all_Transcripts;
my @left_endpoints;
my @right_endpoints;
my $slice = undef;
my $strand = undef;
my $sequence = "";
for my $transcript (@$transcripts) {
my $exons = $transcript->get_all_Exons;
for my $exon (@$exons) {
$slice = $exon->slice unless ($slice);
$strand = $exon->strand unless ($strand);
my $crs = $exon->coding_region_start($transcript);
my $cre = $exon->coding_region_end($transcript);
if ($crs) {
insert_endpoints($crs, $cre, \@left_endpoints, \@right_endpoints);
}
}
}
return (\@left_endpoints, \@right_endpoints);
}
sub insert_endpoints {
my ($new_left, $new_right, $left_endpoints, $right_endpoints) = @_;
my $number_existing = @$left_endpoints;
my ($new_left_region, $new_right_region);
my ($new_left_in, $new_right_in) = (0, 0);
if ($number_existing == 0) {
push @$left_endpoints, $new_left;
push @$right_endpoints, $new_right;
return;
}
if ($new_right < $left_endpoints->[0]) {
$new_left_region = $new_right_region = 0;
} elsif ($new_left > $right_endpoints->[$number_existing - 1]) {
$new_left_region = $new_right_region = $number_existing;
} else {
my $i = 0;
while ($right_endpoints->[$i] < $new_left) {
$i++;
}
$new_left_region = $i;
if ($left_endpoints->[$i] <= $new_left) {
$new_left_in = 1;
}
$i = $number_existing - 1;
while ($left_endpoints->[$i] > $new_right) {
$i--;
}
$new_right_region = $i;
if ($right_endpoints->[$i] >= $new_right) {
$new_right_in = 1;
}
}
my $length = $new_right_region - $new_left_region + ($new_left_region != $new_right_region || $new_left_in || $new_right_in ? 1 : 0);
splice @$left_endpoints, $new_left_region, $length, ($new_left_in ? $left_endpoints->[$new_left_region] : $new_left);
splice @$right_endpoints, $new_left_region, $length, ($new_right_in ? $right_endpoints->[$new_right_region] : $new_right);
}
sub partial_lengths {
my ($left_endpoints, $right_endpoints, $strand) = @_;
my @partials = (0);
my $length = 0;
if ($strand == 1) {
for (my $j = 0; $j <= @$left_endpoints - 1; $j++) {
$length += $right_endpoints->[$j] - $left_endpoints->[$j] + 1;
push @partials, $length;
}
} else {
for (my $j = @$left_endpoints - 1; $j >= 0; $j--) {
$length += $right_endpoints->[$j] - $left_endpoints->[$j] + 1;
push @partials, $length;
}
}
return \@partials;
}
sub slice_coords_to_aligned_coords {
my ($left, $right, $left_endpoints, $right_endpoints, $partial_lengths, $strand) = @_;
my $index_from_front = 0;
my $index_from_back = @$left_endpoints - 1;
if ($strand == 1) {
while ($right_endpoints->[$index_from_front] < $right) {
$index_from_front++;
}
return ($left - $left_endpoints->[$index_from_front]) + $partial_lengths->[$index_from_front];
} else {
while ($left_endpoints->[$index_from_back] > $left) {
$index_from_front++;
$index_from_back--;
}
return ($right_endpoints->[$index_from_back] - $right) + $partial_lengths->[$index_from_front];
}
}
precisely what I needed. thx!