Source code for procars.step_modules.compute_initialpqtree

#!/usr/bin/env python
# -*- coding: utf-8 -*-:

"""
Copyright © Bonsai - LIFL (Université Lille 1, CNRS UMR 8022) and Inria-Lille Nord Europe

contact: aida.ouangraoua@inria.fr, amandine.perrin@inria.fr

This software is a computer program whose purpose is to progressively reconstruct ancestral
gene orders.

This software is governed by the CeCILL-B license under French law and
abiding by the rules of distribution of free software. You can use,
modify and/or redistribute the software under the terms of the CeCILL-B
license as circulated by CEA, CNRS and Inria at the following URL
http://www.cecill.info, or in the LICENCE file at the root directory of this program.

The fact that you are presently reading this means that you have had
knowledge of the CeCILL-B license and that you accept its terms.

---------------------------------------------------

``compute_initialpqtree`` **module description**:

This module reads the source files (tree file and blocks file), stores information,
and fills a CAR file with an initial set of CARs, each composed of a single block.

It needs, as input :

 * a species tree where a node is marked as the ancestor: @
 * a set of universal synteny blocks
 * a CAR file name (in which initial PQtree will be saved)

.. moduleauthor:: Aïda Ouangraoua, Amandine PERRIN

February 2014

"""
from procars.utils import utils_IO
from procars.utils import initial_files_reader as read_init


[docs]def mark_ancestors(tree, ancestor_id): """ Method marking tree nodes which are on the path between the ancestor node and the root node Parameters ---------- tree : dict Tree structure : {node_id: [node_name, parent_id, left_child_id, right_child_id], ...} where id = -1 if no parent/left_child/right_child and node_name = *"N"* + *node_id* for internal nodes, or *leaf_name* for leaf nodes ancestor_id : int ID of ancestor node in tree Returns ------- list 0-1 array such that path[node_id] = 1 if node is on the path between ancestor node and root, and otherwise path[node_id] = 0 """ nb_nodes = len(tree.keys()) # number of nodes in tree path = [0 for _ in range(nb_nodes + 1)] cur_node = ancestor_id while(cur_node != -1): path[cur_node] = 1 cur_node = tree[cur_node][1] # Next node is the parent of current node return path
[docs]def list_leaves_internals(tree, root_id, path): """ Function computing a list of leaves and a postfix ordered list of internal nodes for a tree Recursive algorithm visiting nodes from ancestor node to leaves, in a prefix order. Parameters ---------- tree : dict Tree structure : {node_id: [node_name, parent_id, left_child_id, right_child_id], ...} where id = -1 if no parent/left_child/right_child and node_name = *"N" + node_id* for internal nodes, or *leaf_name* for leaf nodes root_id : int The node considered as the new root of the tree path : list 0-1 array such that path[node_id] = 1 if node is on the path between ancestor node and root, and otherwise path[node_id]=0 Returns ------- list *leaves_internals:* list containing two lists: a list of leaves ids and an ordered list of internal node ids """ leaves_internals = [[], []] # array, (leave ids array, internal node ids array) # if id is a leaf if(tree[root_id][2] == -1 and tree[root_id][3] == -1): # add it in leaves array leaves_internals = [[root_id], []] else: # add it in internal nodes array leaves_internals = [[], [root_id]] # recover child1, child2, and parent id child1_id = tree[root_id][2] child2_id = tree[root_id][3] parent_id = tree[root_id][1] # add new leaves at the end of leaves array # add new internal node at the beginning of internal nodes array # recover list of leaves, and list of internal nodes from child1 if(path[child1_id] != 1): leaves_internals_child1 = list_leaves_internals(tree, child1_id, path) leaves_internals[0] = leaves_internals[0] + leaves_internals_child1[0] leaves_internals[1] = leaves_internals_child1[1] + leaves_internals[1] # recover list of leaves, and list of internal nodes from child2 if(path[child2_id] != 1): leaves_internals_child2 = list_leaves_internals(tree, child2_id, path) leaves_internals[0] = leaves_internals[0] + leaves_internals_child2[0] leaves_internals[1] = leaves_internals_child2[1] + leaves_internals[1] # recover list of leaves, and list of internal nodes from parent if(path[root_id] == 1): if(parent_id != -1): leaves_internals_parent = list_leaves_internals(tree, parent_id, path) leaves_internals[0] = leaves_internals[0] + leaves_internals_parent[0] leaves_internals[1] = leaves_internals_parent[1] + leaves_internals[1] return leaves_internals
[docs]def compute_parameters_from_tree(tree_file_name): """ Function computing parameters from a tree file Parameters ---------- tree_file_name : String Name of file containing the species tree to read to extract parameters Returns ------- tuple *tree:* dict, tree structure *ancestor_id:* int, id of ancestor node in tree *path:* list, array of 0 and 1 indicating nodes that are ancestor of ancestor node *leaves:* list, array of integers containing a list of leave ids *internals:* list, integer array containing an ordered list of internal nodes ids in a postfix order with ancestor node as root """ tree, ancestor_id = read_init.read_treefile(tree_file_name) path = mark_ancestors(tree, ancestor_id) leaves, ordered_internals = list_leaves_internals(tree, ancestor_id, path) return tree, ancestor_id, path, leaves, ordered_internals
[docs]def partition_species(species1, species2, species3, tree, ancestor_id, cur_id, partition_id): """ Function computing the partition of species defined by the ancestor node Recursive algorithm Parameters ---------- species1 : list Species names in group 1 array to be filled species2 : list Species names in group 2 array to be filled species3 : list Species names in group 3 array to be filled tree : dict Tree structure : {node_id: [node_name, parent_id, left_child_id, right_child_id], ...} Where *id* = -1 if no parent/left_child/right_child And node_name = *"N" + node_id* for internal nodes, or *leaf_name* for leaf nodes ancestor_id : int ID of ancestor node in tree cur_id : int ID of current node during the tree transversal partition_id : int ID of current group in partition Returns ------- list *species1:* String array, names of group1 species *species2:* String array, names of group2 species *species3:* String array, names of group3 species """ child1 = tree[cur_id][2] child2 = tree[cur_id][3] # if node is leaf (species) spe_groups = [species1, species2, species3] if(child1 == -1 and child2 == -1): spe_name = tree[cur_id][0] spe_groups[partition_id - 1].append(spe_name) # if node is an internal node else: # if node is the ancestor if(cur_id == ancestor_id): partition_species(species1, species2, species3, tree, ancestor_id, child1, 2) partition_species(species1, species2, species3, tree, ancestor_id, child2, 3) # if node is not the ancestor else: partition_species(species1, species2, species3, tree, ancestor_id, child1, partition_id) partition_species(species1, species2, species3, tree, ancestor_id, child2, partition_id) return species1, species2, species3
[docs]def compute_partition_and_species_id(tree, ancestor_id): """ Function calling the method partitionning the tree into 3 groups defined by the ancestor, and determining the 3 groups of species IDs, as well as the list of species with their corresponding ID. Parameters ---------- tree : dict Tree structure : {node_id: [node_name, parent_id, left_child_id, right_child_id], ...} Where *id* = -1 if no parent/left_child/right_child And node_name = *"N" + node_id* for internal nodes, or *leaf_name* for leaf nodes ancestor_id : int ID of ancestor node in tree Returns ------- tuple *species1_id:* integer array, ids of group1 species *species2_id:* integer array, ids of group2 species *species3_id:* integer array, ids of group3 species *spe_ids:* map such that spe_ids[spe_name] = spe_id """ species1 = [] # array of output species names species2 = [] # array of input1 species names species3 = [] # array of input2 species names partition_species(species1, species2, species3, tree, ancestor_id, 1, 1) spe_ids = {spe: cur_id for cur_id, spe in enumerate(species1 + species2 + species3)} species1_id = [spe_ids[spe] for spe in species1] species2_id = [spe_ids[spe] for spe in species2] species3_id = [spe_ids[spe] for spe in species3] return species1_id, species2_id, species3_id, spe_ids
[docs]def commons_read_tree(tree_file, binary_file_name): """ Reading, and computing parameters and partitions from a tree file, and saving values computed in a binary file. Parameters ---------- tree_file_name : String Name of file containing species tree binary_file_name : String Name of file where results must be stored Returns ------- list *nb_species:* int, total number of species *spe_ids:* dict, {species1: id1, species2: id2, ...} """ # read and compute parameters from species tree file tree, ancestor_id, path, leaves, ordered_internals = compute_parameters_from_tree(tree_file) nb_species = len(leaves) # number of species in tree # Ancestor node defines a partition of species into 3 groups: determine it res = compute_partition_and_species_id(tree, ancestor_id) species1_id, species2_id, species3_id, spe_ids = res # save computed values in a binary file values_saved = [tree, ancestor_id, path, leaves, ordered_internals, nb_species, species1_id, species2_id, species3_id, spe_ids] utils_IO.save_binary_information(binary_file_name, values_saved) return nb_species, spe_ids
[docs]def compute_blocks_neighbors(nb_blocks, nb_species, blocks): """ Function computing block neighbors Parameters ---------- nb_blocks : int Total number of blocks nb_species : int Total number of species blocks : list List of lists blocks = [[*list of blocks for spe_id=0*], [*list of blocks for spe_id=1*], ...] with *list of blocks for spe_id=1* = [[*list of blocks for spe_id=1, chromosome 1*], [*list of blocks for spe_id=1, chromosome 2*], ...] with *list of blocks for spe_id=1, chromosome 1* = [[start, end, num_bloc, orient], [start, end, num_bloc, orient], ... for the n blocks] Returns ------- tuple *left :* array such that left[spe_id][block_id] is an array containing the left neighbor of block block_id in spe_id *right :* array such that right[spe_id][block_id] is an array containing the right neighbor of block_id in spe_id """ left = [] # left = for each species : [[0, block_before0], [0, block_before1], [0, block_before2], ...] right = [] # right = for each species : [[0], [0, block_after1], [0, block_after2], ...] # add cells of arrays left and right for spe_id in range(nb_species): left.append([]) right.append([]) for _ in range(nb_blocks + 1): left[spe_id].append([0]) right[spe_id].append([0]) for chr_id in range(len(blocks[spe_id])): # sort blocks by start position blocks[spe_id][chr_id].sort() # for each species for spe_id in range(nb_species): # for each chromosomes in species for chr_id in range(len(blocks[spe_id])): last_block_id = 0 last_block_dir = 0 # for each block in chromosome for signed_block in blocks[spe_id][chr_id]: block_id = signed_block[2] block_dir = signed_block[3] # compute left and right neighbor if(block_dir == 1): left[spe_id][block_id].append(last_block_id * last_block_dir) elif(block_dir == -1): right[spe_id][block_id].append(-last_block_id * last_block_dir) if(last_block_dir == 1): right[spe_id][last_block_id].append(block_id * block_dir) elif(last_block_dir == -1): left[spe_id][last_block_id].append(-block_id * block_dir) last_block_id = block_id last_block_dir = block_dir if(last_block_dir == 1): right[spe_id][last_block_id].append(0) else: left[spe_id][last_block_id].append(0) return left, right
[docs]def commons_read_blocks(block_file_name, nb_species, spe_ids, binary_file_name): """ Function reading blocks from a block file and computing block neighbors. Saves all information about blocks into a binary file, used in the next steps Parameters ---------- block_file_name : String Name of file containing the ordered blocks for all species nb_species : int Total number of species spe_ids : dict Species and their corresponding ID {spe1: id1, spe2: id2, ...} binary_file_name : String Name of the file in which saving all information about the blocks Returns ------- int number of blocks """ # read blocks file blocks, nb_blocks = read_init.read_block_file(block_file_name, nb_species, spe_ids) # Find previous and next blocks left, right = compute_blocks_neighbors(nb_blocks, nb_species, blocks) # save computed values in a binary file values_saved = [blocks, nb_blocks, left, right] utils_IO.save_binary_information(binary_file_name, values_saved) return nb_blocks
[docs]def write_init_cars_file(car_file_name, nb_blocks): """ Creates a CAR file, with an initial set of CARs, each composed of a single block Parameters ---------- car_file_name : String File in which saving the CARs nb_blocks : int Total number of blocks = number of CARs in the output file """ with open(car_file_name, "w") as output_file: for num in range(1, nb_blocks + 1): output_file.write("#CAR" + str(num) + "\n") output_file.write("_Q " + str(num) + " Q_\n")
[docs]def main(tree_file_name, block_file_name, car_file_name, tree_bin, blocks_bin, adj_file): """ Main method. Reads tree file and block file, stores information into binary files, and creates a CAR file with an initial set of CARs, each composed of a single block. Also creates an empty adjacency file (no adjacency added at step 0). Parameters ---------- tree_file_name : string Name of the species tree to read block_file_name : string Name of the blocks file to read car_file_name : string Name of PQtree file in which saving the initial set of CARs tree_bin : string Name of binary file in which saving all information found in tree file blocks_bin : string Name of binary file in which saving all information found in blocks file adj_file : string Name of adjacency file in which saving adjacencies added at step0 (= no adjacency -> empty file) Returns ------- int total number of blocks """ # Step1 : reading tree file nb_species, species_ids = commons_read_tree(tree_file_name, tree_bin) # Step2 : reading blocks file nb_blocks = commons_read_blocks(block_file_name, nb_species, species_ids, blocks_bin) # Step3 : Writing CARs write_init_cars_file(car_file_name, nb_blocks) # Write adjacency file for step0: empty adjacency file open(adj_file, 'a').close() open(adj_file + "_discarded.txt", "a").close() print("... " + str(nb_blocks) + " cars") return nb_blocks