#!/usr/bin/env python
# -*- coding: utf-8 -*-:
"""
Copyright © Bonsai - LIFL (Université Lille 1, CNRS UMR 8022) and Inria-Lille Nord Europe
contact: aida.ouangraoua@inria.fr, amandine.perrin@inria.fr
This software is a computer program whose purpose is to progressively reconstruct ancestral
gene orders.
This software is governed by the CeCILL-B license under French law and
abiding by the rules of distribution of free software. You can use,
modify and/or redistribute the software under the terms of the CeCILL-B
license as circulated by CEA, CNRS and Inria at the following URL
http://www.cecill.info, or in the LICENCE file at the root directory of this program.
The fact that you are presently reading this means that you have had
knowledge of the CeCILL-B license and that you accept its terms.
---------------------------------------------------
``compute_initialpqtree`` **module description**:
This module reads the source files (tree file and blocks file), stores information,
and fills a CAR file with an initial set of CARs, each composed of a single block.
It needs, as input :
* a species tree where a node is marked as the ancestor: @
* a set of universal synteny blocks
* a CAR file name (in which initial PQtree will be saved)
.. moduleauthor:: Aïda Ouangraoua, Amandine PERRIN
February 2014
"""
from procars.utils import utils_IO
from procars.utils import initial_files_reader as read_init
[docs]def mark_ancestors(tree, ancestor_id):
""" Method marking tree nodes which are on the path between the ancestor node and the root node
Parameters
----------
tree : dict
Tree structure :
{node_id: [node_name, parent_id, left_child_id, right_child_id], ...}
where id = -1 if no parent/left_child/right_child
and node_name = *"N"* + *node_id* for internal nodes, or *leaf_name* for leaf nodes
ancestor_id : int
ID of ancestor node in tree
Returns
-------
list
0-1 array such that path[node_id] = 1 if node is on the path between ancestor node and root,
and otherwise path[node_id] = 0
"""
nb_nodes = len(tree.keys()) # number of nodes in tree
path = [0 for _ in range(nb_nodes + 1)]
cur_node = ancestor_id
while(cur_node != -1):
path[cur_node] = 1
cur_node = tree[cur_node][1] # Next node is the parent of current node
return path
[docs]def list_leaves_internals(tree, root_id, path):
""" Function computing a list of leaves and a postfix ordered list of internal nodes for a tree
Recursive algorithm visiting nodes from ancestor node to leaves, in a prefix order.
Parameters
----------
tree : dict
Tree structure :
{node_id: [node_name, parent_id, left_child_id, right_child_id], ...}
where id = -1 if no parent/left_child/right_child
and node_name = *"N" + node_id* for internal nodes, or *leaf_name* for leaf nodes
root_id : int
The node considered as the new root of the tree
path : list
0-1 array such that path[node_id] = 1 if node is on the path between ancestor node and root,
and otherwise path[node_id]=0
Returns
-------
list
*leaves_internals:* list containing two lists: a list of leaves ids and an ordered list of
internal node ids
"""
leaves_internals = [[], []] # array, (leave ids array, internal node ids array)
# if id is a leaf
if(tree[root_id][2] == -1 and tree[root_id][3] == -1):
# add it in leaves array
leaves_internals = [[root_id], []]
else:
# add it in internal nodes array
leaves_internals = [[], [root_id]]
# recover child1, child2, and parent id
child1_id = tree[root_id][2]
child2_id = tree[root_id][3]
parent_id = tree[root_id][1]
# add new leaves at the end of leaves array
# add new internal node at the beginning of internal nodes array
# recover list of leaves, and list of internal nodes from child1
if(path[child1_id] != 1):
leaves_internals_child1 = list_leaves_internals(tree, child1_id, path)
leaves_internals[0] = leaves_internals[0] + leaves_internals_child1[0]
leaves_internals[1] = leaves_internals_child1[1] + leaves_internals[1]
# recover list of leaves, and list of internal nodes from child2
if(path[child2_id] != 1):
leaves_internals_child2 = list_leaves_internals(tree, child2_id, path)
leaves_internals[0] = leaves_internals[0] + leaves_internals_child2[0]
leaves_internals[1] = leaves_internals_child2[1] + leaves_internals[1]
# recover list of leaves, and list of internal nodes from parent
if(path[root_id] == 1):
if(parent_id != -1):
leaves_internals_parent = list_leaves_internals(tree, parent_id, path)
leaves_internals[0] = leaves_internals[0] + leaves_internals_parent[0]
leaves_internals[1] = leaves_internals_parent[1] + leaves_internals[1]
return leaves_internals
[docs]def compute_parameters_from_tree(tree_file_name):
""" Function computing parameters from a tree file
Parameters
----------
tree_file_name : String
Name of file containing the species tree to read to extract parameters
Returns
-------
tuple
*tree:* dict, tree structure
*ancestor_id:* int, id of ancestor node in tree
*path:* list, array of 0 and 1 indicating nodes that are ancestor of ancestor node
*leaves:* list, array of integers containing a list of leave ids
*internals:* list, integer array containing an ordered list of internal nodes ids in a
postfix order with ancestor node as root
"""
tree, ancestor_id = read_init.read_treefile(tree_file_name)
path = mark_ancestors(tree, ancestor_id)
leaves, ordered_internals = list_leaves_internals(tree, ancestor_id, path)
return tree, ancestor_id, path, leaves, ordered_internals
[docs]def partition_species(species1, species2, species3, tree, ancestor_id, cur_id, partition_id):
""" Function computing the partition of species defined by the ancestor node
Recursive algorithm
Parameters
----------
species1 : list
Species names in group 1 array to be filled
species2 : list
Species names in group 2 array to be filled
species3 : list
Species names in group 3 array to be filled
tree : dict
Tree structure :
{node_id: [node_name, parent_id, left_child_id, right_child_id], ...}
Where *id* = -1 if no parent/left_child/right_child
And node_name = *"N" + node_id* for internal nodes, or *leaf_name* for leaf nodes
ancestor_id : int
ID of ancestor node in tree
cur_id : int
ID of current node during the tree transversal
partition_id : int
ID of current group in partition
Returns
-------
list
*species1:* String array, names of group1 species
*species2:* String array, names of group2 species
*species3:* String array, names of group3 species
"""
child1 = tree[cur_id][2]
child2 = tree[cur_id][3]
# if node is leaf (species)
spe_groups = [species1, species2, species3]
if(child1 == -1 and child2 == -1):
spe_name = tree[cur_id][0]
spe_groups[partition_id - 1].append(spe_name)
# if node is an internal node
else:
# if node is the ancestor
if(cur_id == ancestor_id):
partition_species(species1, species2, species3, tree, ancestor_id, child1, 2)
partition_species(species1, species2, species3, tree, ancestor_id, child2, 3)
# if node is not the ancestor
else:
partition_species(species1, species2, species3, tree, ancestor_id, child1, partition_id)
partition_species(species1, species2, species3, tree, ancestor_id, child2, partition_id)
return species1, species2, species3
[docs]def compute_partition_and_species_id(tree, ancestor_id):
""" Function calling the method partitionning the tree into 3 groups defined by the
ancestor, and determining the 3 groups of species IDs, as well as the list of species with
their corresponding ID.
Parameters
----------
tree : dict
Tree structure :
{node_id: [node_name, parent_id, left_child_id, right_child_id], ...}
Where *id* = -1 if no parent/left_child/right_child
And node_name = *"N" + node_id* for internal nodes, or *leaf_name* for leaf nodes
ancestor_id : int
ID of ancestor node in tree
Returns
-------
tuple
*species1_id:* integer array, ids of group1 species
*species2_id:* integer array, ids of group2 species
*species3_id:* integer array, ids of group3 species
*spe_ids:* map such that spe_ids[spe_name] = spe_id
"""
species1 = [] # array of output species names
species2 = [] # array of input1 species names
species3 = [] # array of input2 species names
partition_species(species1, species2, species3, tree, ancestor_id, 1, 1)
spe_ids = {spe: cur_id for cur_id, spe in enumerate(species1 + species2 + species3)}
species1_id = [spe_ids[spe] for spe in species1]
species2_id = [spe_ids[spe] for spe in species2]
species3_id = [spe_ids[spe] for spe in species3]
return species1_id, species2_id, species3_id, spe_ids
[docs]def commons_read_tree(tree_file, binary_file_name):
"""
Reading, and computing parameters and partitions from a tree file, and saving values computed
in a binary file.
Parameters
----------
tree_file_name : String
Name of file containing species tree
binary_file_name : String
Name of file where results must be stored
Returns
-------
list
*nb_species:* int, total number of species
*spe_ids:* dict, {species1: id1, species2: id2, ...}
"""
# read and compute parameters from species tree file
tree, ancestor_id, path, leaves, ordered_internals = compute_parameters_from_tree(tree_file)
nb_species = len(leaves) # number of species in tree
# Ancestor node defines a partition of species into 3 groups: determine it
res = compute_partition_and_species_id(tree, ancestor_id)
species1_id, species2_id, species3_id, spe_ids = res
# save computed values in a binary file
values_saved = [tree, ancestor_id, path, leaves, ordered_internals, nb_species,
species1_id, species2_id, species3_id, spe_ids]
utils_IO.save_binary_information(binary_file_name, values_saved)
return nb_species, spe_ids
[docs]def compute_blocks_neighbors(nb_blocks, nb_species, blocks):
""" Function computing block neighbors
Parameters
----------
nb_blocks : int
Total number of blocks
nb_species : int
Total number of species
blocks : list
List of lists
blocks = [[*list of blocks for spe_id=0*], [*list of blocks for spe_id=1*], ...]
with *list of blocks for spe_id=1* =
[[*list of blocks for spe_id=1, chromosome 1*],
[*list of blocks for spe_id=1, chromosome 2*],
...]
with *list of blocks for spe_id=1, chromosome 1* =
[[start, end, num_bloc, orient],
[start, end, num_bloc, orient],
... for the n blocks]
Returns
-------
tuple
*left :* array such that left[spe_id][block_id] is an array containing the left
neighbor of block block_id in spe_id
*right :* array such that right[spe_id][block_id] is an array containing the right
neighbor of block_id in spe_id
"""
left = []
# left = for each species : [[0, block_before0], [0, block_before1], [0, block_before2], ...]
right = []
# right = for each species : [[0], [0, block_after1], [0, block_after2], ...]
# add cells of arrays left and right
for spe_id in range(nb_species):
left.append([])
right.append([])
for _ in range(nb_blocks + 1):
left[spe_id].append([0])
right[spe_id].append([0])
for chr_id in range(len(blocks[spe_id])):
# sort blocks by start position
blocks[spe_id][chr_id].sort()
# for each species
for spe_id in range(nb_species):
# for each chromosomes in species
for chr_id in range(len(blocks[spe_id])):
last_block_id = 0
last_block_dir = 0
# for each block in chromosome
for signed_block in blocks[spe_id][chr_id]:
block_id = signed_block[2]
block_dir = signed_block[3]
# compute left and right neighbor
if(block_dir == 1):
left[spe_id][block_id].append(last_block_id * last_block_dir)
elif(block_dir == -1):
right[spe_id][block_id].append(-last_block_id * last_block_dir)
if(last_block_dir == 1):
right[spe_id][last_block_id].append(block_id * block_dir)
elif(last_block_dir == -1):
left[spe_id][last_block_id].append(-block_id * block_dir)
last_block_id = block_id
last_block_dir = block_dir
if(last_block_dir == 1):
right[spe_id][last_block_id].append(0)
else:
left[spe_id][last_block_id].append(0)
return left, right
[docs]def commons_read_blocks(block_file_name, nb_species, spe_ids, binary_file_name):
""" Function reading blocks from a block file and computing block neighbors. Saves all
information about blocks into a binary file, used in the next steps
Parameters
----------
block_file_name : String
Name of file containing the ordered blocks for all species
nb_species : int
Total number of species
spe_ids : dict
Species and their corresponding ID {spe1: id1, spe2: id2, ...}
binary_file_name : String
Name of the file in which saving all information about the blocks
Returns
-------
int
number of blocks
"""
# read blocks file
blocks, nb_blocks = read_init.read_block_file(block_file_name, nb_species, spe_ids)
# Find previous and next blocks
left, right = compute_blocks_neighbors(nb_blocks, nb_species, blocks)
# save computed values in a binary file
values_saved = [blocks, nb_blocks, left, right]
utils_IO.save_binary_information(binary_file_name, values_saved)
return nb_blocks
[docs]def write_init_cars_file(car_file_name, nb_blocks):
""" Creates a CAR file, with an initial set of CARs, each composed of a single block
Parameters
----------
car_file_name : String
File in which saving the CARs
nb_blocks : int
Total number of blocks = number of CARs in the output file
"""
with open(car_file_name, "w") as output_file:
for num in range(1, nb_blocks + 1):
output_file.write("#CAR" + str(num) + "\n")
output_file.write("_Q " + str(num) + " Q_\n")
[docs]def main(tree_file_name, block_file_name, car_file_name, tree_bin, blocks_bin, adj_file):
"""
Main method.
Reads tree file and block file, stores information into binary files, and creates a
CAR file with an initial set of CARs, each composed of a single block. Also creates an
empty adjacency file (no adjacency added at step 0).
Parameters
----------
tree_file_name : string
Name of the species tree to read
block_file_name : string
Name of the blocks file to read
car_file_name : string
Name of PQtree file in which saving the initial set of CARs
tree_bin : string
Name of binary file in which saving all information found in tree file
blocks_bin : string
Name of binary file in which saving all information found in blocks file
adj_file : string
Name of adjacency file in which saving adjacencies added at step0
(= no adjacency -> empty file)
Returns
-------
int
total number of blocks
"""
# Step1 : reading tree file
nb_species, species_ids = commons_read_tree(tree_file_name, tree_bin)
# Step2 : reading blocks file
nb_blocks = commons_read_blocks(block_file_name, nb_species, species_ids, blocks_bin)
# Step3 : Writing CARs
write_init_cars_file(car_file_name, nb_blocks)
# Write adjacency file for step0: empty adjacency file
open(adj_file, 'a').close()
open(adj_file + "_discarded.txt", "a").close()
print("... " + str(nb_blocks) + " cars")
return nb_blocks