from intervaltree import IntervalTree
from .ensemblclient import EnsemblClient
from .utils import region_str
[docs]class AssemblyMapper:
"""Structure that optimizes the mapping between
diferent genome assemblies.
Example::
>>> mapper = AssemblyMapper(from_assembly='GRCh37'
... to_assembly='GRCh38')
>>> mapper.map(chrom='1', pos=1000000)
1064620
"""
# No need to request several times the same info
map_cache = dict()
def __init__(self, from_assembly='GRCh37',
to_assembly='GRCh38',
species='human'):
"""
Parameters
----------
from_assembly: str, optional
Version of the input assembly. Default 'GRCh37'.
to_assembly: str, optional
Version of the output assembly. Default 'GRCh38'.
species: str, optional
Species name/alias. Default 'human'.
"""
# The interface to the Ensembl REST API
self.client = EnsemblClient()
# The assembly mapping is formed by mappings
# of genome slices. We keep at minimum the
# number of requests we make by fetching the
# whole mapping information at once and storing
# it in an interval tree data structure for
# efficient lookup.
self.assembly_map = self._fetch_mapping(from_assembly,
to_assembly,
species)
# ---
def _fetch_mapping(self, from_assembly, to_assembly, species):
"""Fetch the mapping, either make it anew or use a cached one."""
map_key = from_assembly + to_assembly + species
if map_key in self.map_cache:
# Mapping information already fetched, return it
return self.map_cache[map_key]
else:
# Need to assemble the mapping information
mapping = self._assemble_mapping(from_assembly,
to_assembly,
species)
# Cache the assembled mapping
self.__class__.map_cache[map_key] = mapping
return mapping
# ---
def _assemble_mapping(self, from_assembly, to_assembly, species):
"""Assemble the interval tree with the information
for the given mapping.
"""
client = self.client
# Get the chromosome sizes
genome_info = client.assembly_info(species=species)
chromosomes = {item['name']: item['length']
for item in genome_info['top_level_region']
if item['coord_system'] == 'chromosome'}
# Initialize the mapping structure
# (An interval tree for each chromosome)
assembly_map = dict()
for chrom, chrom_len in chromosomes.items():
# Query the mapping information
# for the entire chromosome
region = region_str(chrom, start=1, end=chrom_len)
map_ = client.assembly_map(species=species,
region=region,
asm_one=from_assembly,
asm_two=to_assembly)
# Assemble into the mapping structure
assembly_map[chrom] = self._interval_tree(map_['mappings'],
chrom_len)
return assembly_map
# ---
def _interval_tree(self, mappings, chrom_length):
"""Assemble an interval tree from the mapping information.
An interval tree is a tree data structure that allows to
efficiently find all intervals that overlap with any given
interval or point, often used for windowing queries.
(see https://en.wikipedia.org/wiki/Interval_tree)
The mapping information is a collection where
each item has the shape::
{'mapped':
{'assembly': 'GRCh38',
'coord_system': 'chromosome',
'end': 1039365,
'seq_region_name': 'X',
'start': 1039265,
'strand': 1},
'original':
{'assembly': 'GRCh37',
'coord_system': 'chromosome',
'end': 1000100,
'seq_region_name': 'X',
'start': 1000000,
'strand': 1}}
"""
interval_tree = IntervalTree()
for item in mappings:
# Assemble the interval tree.
# Each item describes a mapping of
# regions btw both assemblies.
from_ = item['original']
to = item['mapped']
# Need to modify to represent a half open
# interval (as [a,b) instead of [a,b])
from_region = from_['start'],from_['end']+1
if to['strand'] == +1:
to_region = to['start'],to['end']
else:
# Handle mappings to the reverse strand
# (Translate them to the forward strand)
# Visual aid to the transformation:
# 1 2 3 4 5 6 7 8 9 10
# | | | | | | | | | |
# 10 9 8 7 6 5 4 3 2 1
to_region = (chrom_length - to['end'] + 1,
chrom_length - to['start'] + 1)
interval_tree.addi(*from_region,
data=to_region)
return interval_tree
# ---
[docs] def map(self, chrom, pos):
"""Map the given position.
The mapping is between the specified assemblies
when creating the object. (default: map position
from assembly GRCh37 to assembly GRCh38)
"""
# Query the interval it maps to
chromosome_mapper = self.assembly_map[str(chrom)]
interval = chromosome_mapper[pos]
if interval:
# Calculate position from the interval
interval = interval.pop()
mapped_pos = interval.data[0] + (pos - interval.begin)
else:
# No mapping found.
mapped_pos = None
return mapped_pos
# ---
# AssemblyMapper