#!/usr/bin/env python
# -*- coding: utf-8 -*-:
"""
Copyright © Bonsai - LIFL (Université Lille 1, CNRS UMR 8022) and Inria-Lille Nord Europe
contact: aida.ouangraoua@inria.fr, amandine.perrin@inria.fr
This software is a computer program whose purpose is to progressively reconstruct ancestral
gene orders.
This software is governed by the CeCILL-B license under French law and
abiding by the rules of distribution of free software. You can use,
modify and/or redistribute the software under the terms of the CeCILL-B
license as circulated by CEA, CNRS and Inria at the following URL
http://www.cecill.info, or in the LICENCE file at the root directory of this program.
The fact that you are presently reading this means that you have had
knowledge of the CeCILL-B license and that you accept its terms.
---------------------------------------------------
``initial_files_reader`` **module description**:
This module contains some methods used to read and parse the initial files:
* Phylogeny tree file (where a node is marked as the ancestor: @)
* Blocks file (with block order of each genome of the tree)
.. moduleauthor:: Aïda Ouangraoua, Amandine PERRIN
May 2014
"""
import re
[docs]def read_treefile(tree_file_name):
""" Function reading the tree structure from a file
Parameters
----------
tree_file_name : String
Name of file containing species tree
Returns
-------
tuple
*tree:* dict, tree structure:
| {node_id: [node_name, parent_id, left_child_id, right_child_id], ...}
| where *id* = -1 if no *parent/left_child/right_child*
| and *node_name* = "N" + *node_id* for internal nodes, or *leaf_name* for leaf nodes
*ancestor_id:* int, id of ancestor node in tree
"""
ancestor_id = -1 # integer, identifier of ancestor node to be reconstructed
tree = {} # a map for the tree structure (see docstring hereabove)
with open(tree_file_name, "r") as treefile:
treeseq = treefile.readline()
cur_id = 0 # ids to be associated to nodes
pile = [] # pile used to remember ancestor nodes
pos = 0 # integer containing successive positions in the string treeseq
while pos < len(treeseq):
# start to read a new internal node
if treeseq[pos] == "(":
cur_id += 1
tree[cur_id] = ["N" + str(cur_id), -1, -1, -1]
# if node has a parent
if len(pile) > 0:
# Add parent ID in the current node array (index 1)
tree[cur_id][1] = pile[-1]
# Add current node as a child to his parent node.
if(tree[pile[-1]][2] == -1):
tree[pile[-1]][2] = cur_id
else:
tree[pile[-1]][3] = cur_id
# add node on top of the pile
pile.append(cur_id)
pos += 1
# end of node
elif treeseq[pos] == ")":
pos += 1
# if node is the ancestor node to be reconstructed, save its ID
if treeseq[pos] == "@":
ancestor_id = pile[-1]
pos += 1
# check if end of line is reached
if treeseq[pos] == ";" or treeseq[pos] == "\n" or treeseq == " ":
pos += 2
del pile[-1]
# new sibling
elif treeseq[pos] == ",":
pos += 1
# leaf node
else:
# Give node a new ID
cur_id += 1
tree[cur_id] = ["", -1, -1, -1]
# if node as a parent
if len(pile) > 0:
# add parent at position 1 in array
tree[cur_id][1] = pile[-1]
# add current node id in children position of parent array
if(tree[pile[-1]][2] == -1):
tree[pile[-1]][2] = cur_id
else:
tree[pile[-1]][3] = cur_id
# add leaf species name at position 0 in the array
begin = pos
while(pos < len(treeseq) and treeseq[pos] != "," and treeseq[pos] != ")"):
pos += 1
nom = treeseq[begin: pos]
nom = "_".join(nom.split()) # Remove spaces in leaf name if there are
tree[cur_id][0] = nom
return tree, ancestor_id
[docs]def read_block_file(block_file_name, nb_species, spe_ids):
""" Method reading blocks from a block file
Parameters
----------
block_file_name : String
Name of file containing the ordered blocks for all species
nb_species : int
Total number of species
spe_ids : dict
Species and their corresponding ID {spe1: id1, spe2: id2, ...}
Returns
-------
list
*blocks:* array of arrays (blocks)
| blocks = [[*list of blocks for spe_id=0*], [*list of blocks for spe_id=1*], ...]
| with *list of blocks for spe_id=1* = [[*list of blocks for spe_id=1, chromosome 1*],
[*list of blocks for spe_id=1, chromosome 2*],
...
]
| with *list of blocks for spe_id=1, chromosome 1* = [[start, end, num_bloc, orient],
[start, end, num_bloc, orient],
... for the n blocks
]
*nb_blocks:* int, total number of blocks
"""
blocks = [[] for _ in xrange(nb_species)] # array containing arrays (one for each species)
with open(block_file_name, "r") as blocfile:
chm = {} # map : chr_name -> integer_id
chm['X'] = 0
bloc_id = 0 # Current bloc id
for line in blocfile:
chm, bloc_id = read_block_line(line, chm, spe_ids, blocks, bloc_id)
nb_blocks = bloc_id
return blocks, nb_blocks
[docs]def read_block_line(line, chm, spe_ids, blocks, bloc_id):
""" Reading one line of a block file, in order to return a new block number, or
the positions of a block in a species chromosome.
Parameters
----------
line : String
A line of the block file
chm : dict
Chromosome IDs : *{chromo: id, chromo2: id2, ...}*
spe_ids : dict
Species and their corresponding ID *{spe1: id1, spe2: id2, ...}*
blocks : list
Array of blocks already read, to complete
bloc_id : int
Current bloc id
Returns
-------
list
*chm:* dict, chromosome IDs : *{chromo: id, chromo2: id2, ...}*
*bloc_id:* int, current bloc id
"""
# New block
if line[0] == '>':
# find line index of '>' (if not first caracter)
end = [m.end() for m in re.finditer('>', line)]
bloc_id = int(line[end[0]:].strip())
# Get block positions for one species
elif len(line.strip()) > 0:
spe = line.strip().split(".")[0]
# if the species exists in our list of species ids, add to block information
if spe in spe_ids:
chromo = line.strip().split(".")[-1].split(":")[0]
start = int(line.split(":")[-1].split("-")[0].strip())
end = int(line.split("-")[1].split()[0].strip())
orient = line.strip().split()[-1]
orient = 1 if orient == "+" else -1
if not chromo.upper() in chm:
chm[chromo.upper()] = max(chm.values()) + 1
if (chm[chromo.upper()] >= len(blocks[spe_ids[spe]])):
nb_chr_add = chm[chromo.upper()] - len(blocks[spe_ids[spe]]) + 1
for _ in xrange(nb_chr_add):
blocks[spe_ids[spe]].append([])
blocks[spe_ids[spe]][chm[chromo.upper()]].append([start, end, bloc_id, orient])
return chm, bloc_id