
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Copyright © Bonsai - LIFL (Université Lille 1, CNRS UMR 8022) and Inria-Lille Nord Europe

contact: aida.ouangraoua@inria.fr, amandine.perrin@inria.fr

This software is a computer program whose purpose is to progressively reconstruct ancestral
gene orders.

This software is governed by the CeCILL-B license under French law and
abiding by the rules of distribution of free software. You can use,
modify and/or redistribute the software under the terms of the CeCILL-B
license as circulated by CEA, CNRS and Inria at the following URL
http://www.cecill.info, or in the LICENCE file at the root directory of this program.

The fact that you are presently reading this means that you have had
knowledge of the CeCILL-B license and that you accept its terms.

---------------------------------------------------

``compute_resolved_conflicts`` **module description**:

This module resolves conflicts in a given set of adjacencies when, at the previous
step, the set of non-conflicting adjacencies was empty.


*Method:*

From a set of conflicting conserved adjacencies, together with the set of species
supporting each adjacency, it computes a maximum set of non-conflicting adjacencies.
We use Sankoff's algorithm for the minimal mutation problem.

.. moduleauthor:: Aïda Ouangraoua, Amandine PERRIN

February 2014
"""


import itertools
import shutil
import sys

from procars.utils import utils_IO


def compute_cost(tree, path, leaves, internals, labels, nb_labels):
    """
    Function computing optimal labels for the ancestor node in a tree

    Dynamic programming version of the Fitch Algorithm for the Small Parsimony
    Problem (Fitch 71)

    Parameters
    ----------
    tree : dict
        Tree structure: {node_id: [node_name, parent_id, left_child_id, right_child_id], ...}
        Where *id* = -1 if no parent/left_child/right_child
        And node_name = *"N" + node_id* for internal nodes, or *leaf_name* for leaf nodes
    path : list
        Array associating value 1 to a node if it is on the path between the ancestor
        and the root node, otherwise value 0
    leaves : list
        List of leaf IDs in the tree
    internals : list
        Ordered list of internal node IDs in the tree
    labels : dict
        Labels of the leaf nodes: 0 if the adjacency is absent, 1 if present
    nb_labels : int
        Number of label types

    Returns
    -------
    list
        *tab:* costs for each node of the tree
    """
    nb_nodes = len(tree)
    max_cost = sys.maxint
    cost = 0
    tab = []
    # Initialize the cost table: one row per node, one column per label
    for i in range(nb_nodes + 1):
        tab.append([])
        for label_id in range(nb_labels + 1):
            tab[i].append(0)
        tab[i][0] = max_cost
    # Leaves: cost 0 for the observed labels, infinite cost otherwise
    for leaf in leaves:
        if(len(labels[leaf]) > 0):
            for label_id in range(1, nb_labels + 1):
                if(label_id in labels[leaf]):
                    tab[leaf][label_id] = 0
                else:
                    tab[leaf][label_id] = max_cost
    # Internal nodes: combine the costs of the children (and of the parent for
    # nodes on the ancestor-root path)
    for node_id in internals:
        child1_id = tree[node_id][2]
        child2_id = tree[node_id][3]
        parent_id = tree[node_id][1]
        for label_id in range(1, nb_labels + 1):
            cost = 0
            if(path[child1_id] != 1):
                cost += min([min(tab[child1_id]) + 1, tab[child1_id][label_id]])
            if(path[child2_id] != 1):
                cost += min([min(tab[child2_id]) + 1, tab[child2_id][label_id]])
            if(path[node_id] == 1):
                if(parent_id != -1):
                    cost += min([min(tab[parent_id]) + 1, tab[parent_id][label_id]])
            tab[node_id][label_id] = cost
    return tab


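# Illustrative sketch, not part of the original ProCARs code: a hypothetical call of
# compute_cost() on a small 5-node tree with three leaves (A, B, C), internal node N3
# as the ancestor of interest and N4 as the root. All names, node IDs and label IDs
# below are invented for illustration; real inputs come from utils_IO.read_binary_file
# and utils_IO.read_conflict_adj_file.
def _toy_compute_cost_example():
    """Hypothetical example showing the expected shape of compute_cost() inputs."""
    # node_id: [node_name, parent_id, left_child_id, right_child_id]
    toy_tree = {0: ["A", 3, -1, -1], 1: ["B", 3, -1, -1], 2: ["C", 4, -1, -1],
                3: ["N3", 4, 0, 1], 4: ["N4", -1, 3, 2]}
    toy_path = [0, 0, 0, 1, 1]        # N3 (ancestor) and N4 (root) are on the path
    toy_leaves = [0, 1, 2]
    toy_internals = [3, 4]            # children listed before their parent
    toy_labels = {0: [2], 1: [2], 2: [1]}  # allowed label IDs (1 or 2) for each leaf
    # tab[node_id][label_id] gives the cost of assigning label_id to node_id
    return compute_cost(toy_tree, toy_path, toy_leaves, toy_internals, toy_labels, 2)

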
def get_adjacencies_costs(adj_file, nb_species, leaves, tree, spe_ids, path,
                          internals, ancestor_id):
    """
    Read the file where discarded adjacencies were stored, and compute the cost of
    each of these adjacencies

    Parameters
    ----------
    adj_file : string
        File in which discarded adjacencies are written
    nb_species : int
        Total number of species
    leaves : list
        List of IDs of tree leaves (= genomes)
    tree : dict
        Tree structure {node_id: [node_name, parent_id, left_child_id, right_child_id], ...}
        Where *id* = -1 if no parent/left_child/right_child
        And node_name = *"N" + node_id* for internal nodes, or *leaf_name* for leaf nodes
    spe_ids : dict
        Species as keys, and their corresponding ID as value
    path : list
        Array associating value 1 to a node if it is on the path between the ancestor
        and the root node, otherwise value 0
    internals : list
        IDs of internal nodes of the tree
    ancestor_id : int
        ID of the ancestor node

    Returns
    -------
    tuple
        - *adjacencies_score:* dict such that adjacencies_score[id] = score of the
          adjacency with ID *id*
        - *adj_ids:* dict mapping adjacency IDs to adjacencies: adj_ids[id] = (bloc1, bloc2)
        - *step_car_adjs:* dict such that step_car_adjs[id] = (car1, car2, type, step, labels),
          where *car1, car2* is the CAR adjacency corresponding to the adjacency with ID *id*,
          *type* is the type of adjacency, *step* is the step at which the adjacency was
          found, and *labels* gives the presence (1) or absence (0) of the adjacency in
          each species.
    """
    adjacencies_score = {}
    adj_ids = {}
    step_car_adjs = {}
    iterator = utils_IO.read_conflict_adj_file(adj_file, nb_species, leaves, tree, spe_ids)
    for labels, adj_id, adj, step_car_adj in iterator:
        # cost of the adjacency at the ancestor node (label 2 column of the cost table)
        tab = compute_cost(tree, path, leaves, internals, labels, 2)
        cost = tab[ancestor_id][2]
        adjacencies_score[adj_id] = cost
        adj_ids[adj_id] = adj
        step_car_adjs[adj_id] = step_car_adj
    return adjacencies_score, adj_ids, step_car_adjs


def find_conflicts_couples(adj_ids):
    """
    Find couples of adjacencies in conflict

    Among all possible couples of adjacencies, find the ones which involve a common end.

    Parameters
    ----------
    adj_ids : dict
        Dictionary such that adj_ids[id] = (bloc1, bloc2), with bloc1 and bloc2 the two
        block numbers of the adjacency with ID *id*

    Returns
    -------
    list
        *conflicts:* a list containing all couples of adjacency IDs which are in
        conflict (because they share one end).
    """
    conflicts = []
    non_conflicts = []  # compatible couples (collected but not returned)
    # For each couple of two different adjacencies, check if they share a same end
    for adj_couple_ids in itertools.combinations(adj_ids, 2):
        adj_couple = (adj_ids[adj_couple_ids[0]], adj_ids[adj_couple_ids[1]])
        (i1, i2), (j1, j2) = adj_couple
        if(i1 == j1 or i1 == -j2 or i2 == -j1 or i2 == j2):
            conflicts.append(adj_couple_ids)
        else:
            non_conflicts.append(adj_couple_ids)
    return conflicts


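# Illustrative sketch, not part of the original ProCARs code: with the three
# hypothetical adjacencies below, (1, 2) and (1, 3) share the block end 1, so the
# couple of IDs (0, 1) is reported as conflicting, while adjacency 2 conflicts with
# neither of them.
def _toy_conflicts_example():
    """Hypothetical example of calling find_conflicts_couples()."""
    toy_adj_ids = {0: (1, 2), 1: (1, 3), 2: (4, 5)}
    return find_conflicts_couples(toy_adj_ids)  # expected result: [(0, 1)]

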
def find_independant_sets(nb_adjacencies, conflicts):
    """
    Find sets of independent adjacencies

    From the list of conflicting adjacencies, it builds sets by adding, at each
    iteration, new adjacencies that are compatible with all adjacencies already in a
    set. The sets of independent (= non-conflicting) adjacencies thus grow at each
    iteration, until no more adjacency can be added.

    Parameters
    ----------
    nb_adjacencies : int
        Total number of adjacencies
    conflicts : list
        List of pairs of adjacency IDs which are in conflict

    Returns
    -------
    list
        *independent_sets:* list of sets of compatible adjacencies. All sets have the
        same size, which is the maximum possible size of an independent adjacency set
    """
    independent_sets = []  # array storing all independent_sets
    # independent_sets found at the last step:
    last_independent_sets = [[i] for i in xrange(nb_adjacencies)]
    size_max_independent_set = nb_adjacencies - 1  # maximum possible size of an independent set
    effective_size_max = 1  # current maximum size found
    # boolean checking that the size of the independent sets increased. If not, stop:
    adj_added = True
    while effective_size_max < size_max_independent_set and adj_added:
        effective_size_max += 1
        adj_added = False
        # For each adjacency set found at the last step
        for cur_set in last_independent_sets:
            # Try to add other adjacencies to the set...
            for j in xrange(cur_set[-1] + 1, nb_adjacencies):
                is_conflicting = False
                # ...if not in conflict with any adjacency already in the set
                for adj_id in cur_set:
                    if (adj_id, j) in conflicts or (j, adj_id) in conflicts:
                        is_conflicting = True
                # if new adjacency 'j' is compatible with all adjs in cur_set, add it
                if not is_conflicting:
                    adj_added = True
                    independent_sets.append(list(cur_set) + [j])
        # If we added an adjacency to a set, we run a new step
        # -> initialize the sets for the next step
        if adj_added:
            last_independent_sets = independent_sets
            independent_sets = []
    return last_independent_sets


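# Illustrative sketch, not part of the original ProCARs code: continuing the toy
# example above, with three adjacencies where only IDs 0 and 1 are in conflict, the
# maximum independent sets both have size 2.
def _toy_independent_sets_example():
    """Hypothetical example of calling find_independant_sets()."""
    toy_conflicts = [(0, 1)]
    return find_independant_sets(3, toy_conflicts)  # expected result: [[0, 2], [1, 2]]

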
def find_mincost_set(independent_sets, adjacencies_score):
    """
    Among all sets of compatible adjacencies of maximum size, find the one with
    minimum cost

    Parameters
    ----------
    independent_sets : list
        List of sets of independent adjacency IDs
    adjacencies_score : dict
        Scores of the adjacencies, such that adjacencies_score[*id*] = score of the
        adjacency with ID *id*

    Returns
    -------
    list
        *maximum_set:* a list of compatible adjacencies, with minimum global cost
    """
    maximum_set = []
    min_cost = sys.maxint  # maximum int value, not reachable by the cost
    for cur_set in independent_sets:
        # total cost of the current set
        cost = 0
        for adj_id in cur_set:
            cost += adjacencies_score[adj_id]
        if cost < min_cost:
            maximum_set = cur_set
            min_cost = cost
    return maximum_set


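# Illustrative sketch, not part of the original ProCARs code: given the two maximum
# independent sets from the toy example above and hypothetical per-adjacency costs,
# the set with the smallest total cost is retained.
def _toy_mincost_example():
    """Hypothetical example of calling find_mincost_set()."""
    toy_sets = [[0, 2], [1, 2]]
    toy_scores = {0: 1, 1: 0, 2: 2}
    return find_mincost_set(toy_sets, toy_scores)  # total costs 3 vs 2 -> [1, 2]

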
def change_adj_discarded_files(adj_prefix_file, step):
    """
    Save discarded adjacencies into a new file for information, and remove them from
    the last file

    Parameters
    ----------
    adj_prefix_file : string
        Prefix of the adjacency file name
    step : int
        Current step of the ProCARs method
    """
    last_adj_discard = adj_prefix_file + "_" + str(step) + ".txt_discarded.txt"
    new_adj_discard = adj_prefix_file + "_" + str(step) + ".txt_discarded_used.txt"
    # save discarded adjacencies into a new file with the 'used' suffix
    shutil.move(last_adj_discard, new_adj_discard)
    # empty the discarded adjacencies file
    open(last_adj_discard, "w").close()


def write_retained_adjs(adj_filename_prefix, adj_infos, maximum_set, adj_ids, step):
    """
    Write all adjacencies in the retained set of non-conflicting adjacencies
    (maximum size and minimum cost) into a new adjacency file.

    Parameters
    ----------
    adj_filename_prefix : string
        Prefix of the adjacency file name
    adj_infos : dict
        Dictionary with adjacency IDs as keys, and values are a list with:

        - the CAR adjacency (car1, car2)
        - the type of adjacency
        - the step at which it was found
        - the presence of this adjacency in each species
    maximum_set : list
        List of retained adjacency IDs
    adj_ids : dict
        Keys are adjacency IDs, and values are the adjacency corresponding to the ID
    step : int
        Current step of the ProCARs method
    """
    last_adjs = adj_filename_prefix + "_" + str(step - 1) + ".txt"
    new_adjs = adj_filename_prefix + "_" + str(step) + ".txt"
    # keep the adjacencies of the previous step
    with open(last_adjs, "r") as last:
        all_last_adjs = last.readlines()
    # write the retained conflicting adjacencies, then append the previous ones
    utils_IO.write_retained_conflict_adjs(new_adjs, adj_infos, maximum_set, adj_ids)
    with open(new_adjs, "a") as new:
        new.writelines(all_last_adjs)


def main(adjacency_file_name, step, tree_bin):
    """
    Main method. From a set of conflicting conserved adjacencies, together with the
    set of species supporting each adjacency, it computes a maximum set of
    non-conflicting adjacencies.

    Parameters
    ----------
    adjacency_file_name : string
        Prefix of the adjacency file
    step : int
        Current step of the ProCARs method
    tree_bin : string
        Name of the binary file containing all information about the species tree.

    Returns
    -------
    tuple
        - *maximum_set:* list of retained adjacency IDs (one of the independent_sets)
        - *independent_sets:* list of compatible adjacency sets. All sets have the
          same size, which is the maximum possible size of an independent adjacency set
    """
    # Define filenames
    conflicts_filename = adjacency_file_name + "_" + str(step) + ".txt_discarded.txt"
    # Read the species tree file
    [tree, ancestor_id, path, leaves, ordered_internals, nb_species,
     _, _, _, spe_ids] = utils_IO.read_binary_file(tree_bin)
    # Get the conflicting adjacencies of the last step and their costs
    adj_res = get_adjacencies_costs(conflicts_filename, nb_species, leaves, tree,
                                    spe_ids, path, ordered_internals, ancestor_id)
    adjacencies_score, adj_ids, adj_infos = adj_res
    # Find the conflicting adjacencies two by two
    conflicts = find_conflicts_couples(adj_ids)
    # Find independent sets of adjacencies
    independent_sets = find_independant_sets(len(adj_ids.keys()), conflicts)
    # Compute the total cost of each maximum-size independent set and choose a
    # minimum-cost independent set
    maximum_set = find_mincost_set(independent_sets, adjacencies_score)
    # Write the retained adjacencies
    write_retained_adjs(adjacency_file_name, adj_infos, maximum_set, adj_ids, step)
    change_adj_discarded_files(adjacency_file_name, step)
    print("... " + str(len(maximum_set)) + " retained adjacencies\n")
    return maximum_set, independent_sets


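# Illustrative usage sketch, not part of the original ProCARs code: the file prefix,
# step number and tree binary below are hypothetical. main() expects the discarded
# adjacency file "<prefix>_<step>.txt_discarded.txt" and the previous adjacency file
# "<prefix>_<step - 1>.txt" to exist, as produced by the earlier ProCARs steps.
#
#   retained, all_max_sets = main("output/adjacencies", 3, "output/tree.bin")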