#!/usr/bin/env python
# -*- coding: utf-8 -*-:
"""
Copyright © Bonsai - LIFL (Université Lille 1, CNRS UMR 8022) and Inria-Lille Nord Europe
contact: aida.ouangraoua@inria.fr, amandine.perrin@inria.fr
This software is a computer program whose purpose is to progressively reconstruct ancestral
gene orders.
This software is governed by the CeCILL-B license under French law and
abiding by the rules of distribution of free software. You can use,
modify and/or redistribute the software under the terms of the CeCILL-B
license as circulated by CEA, CNRS and Inria at the following URL
http://www.cecill.info, or in the LICENCE file at the root directory of this program.
The fact that you are presently reading this means that you have had
knowledge of the CeCILL-B license and that you accept its terms.
---------------------------------------------------
``compute_resolved_conflicts`` **module description**:
This module is used to resolve conflicts in a given set of adjacencies, when, at the previous
step, the set of non-conflicting adjacencies was empty.
*Method:*
From a set of conflicting conserved adjacencies with the set of species supporting each
adjacencies, it computes a maximum set of non-conflicting adjacencies.
We use the Sankoffs algorithm's for the minimal mutation problem.
.. moduleauthor:: Aïda Ouangraoua, Amandine PERRIN
February 2014
"""
from procars.utils import utils_IO
import itertools
import sys
import shutil
[docs]def compute_cost(tree, path, leaves, internals, labels, nb_labels):
""" Function computing optimal labels for the ancestor node in a tree
Dynamic programming version of the Fitch Algorithm for the Small Parsimony Problem (Fitch 71)
Parameters
----------
tree: dict
Tree structure:
{node_id: [node_name, parent_id, left_child_id, right_child_id], ...}
Where *id* = -1 if no parent/left_child/right_child
And node_name = *"N" + node_id* for internal nodes, or *leaf_name* for leaf nodes
path : list
Array, associating value 1 to a node if it is on the path between ancestor and root node,
otherwise value 0
leaves : list
List of leaf IDs in the tree
internals : list
Ordered list of internal node IDs in the tree
labels : dict
Containing labels of leaves nodes: 0 if adjacency is absent, 1 if present
nb_labels : int
Number of label types
Returns
-------
list
*tab:* costs for each node of the tree
"""
nb_nodes = len(tree)
max_cost = sys.maxint
cost = 0
tab = []
for i in range(nb_nodes + 1):
tab.append([])
for label_id in range(nb_labels + 1):
tab[i].append(0)
tab[i][0] = max_cost
for leaf in leaves:
if(len(labels[leaf]) > 0):
for label_id in range(1, nb_labels + 1):
if(label_id in labels[leaf]):
tab[leaf][label_id] = 0
else:
tab[leaf][label_id] = max_cost
for node_id in internals:
child1_id = tree[node_id][2]
child2_id = tree[node_id][3]
parent_id = tree[node_id][1]
for label_id in range(1, nb_labels + 1):
cost = 0
if(path[child1_id] != 1):
cost += min([min(tab[child1_id]) + 1, tab[child1_id][label_id]])
if(path[child2_id] != 1):
cost += min([min(tab[child2_id]) + 1, tab[child2_id][label_id]])
if(path[node_id] == 1):
if(parent_id != -1):
cost += min([min(tab[parent_id]) + 1, tab[parent_id][label_id]])
tab[node_id][label_id] = cost
return tab
[docs]def get_adjacencies_costs(adj_file, nb_species, leaves, tree, spe_ids,
path, internals, ancestor_id):
""" Read the file where discarded adjacencies were stored, and calculate the cost of each
of these adjacencies
Parameters
----------
adj_file : string
File in which discarded adjacencies are written
nb_species : int
Total number of species
leaves : list
List of IDs of tree leaves (= genomes)
tree : dict
Tree structure
{node_id: [node_name, parent_id, left_child_id, right_child_id], ...}
Where *id* = -1 if no parent/left_child/right_child
And node_name = *"N" + node_id* for internal nodes, or *leaf_name* for leaf nodes
spe_ids : dict
Species as keys, and their corresponding ID as value
path : list
Array, associating value 1 to a node if it is on the path between ancestor and root node,
otherwise value 0
internals : list
IDs of internal nodes of the tree
ancestor_id : int
ID of the ancestor node
Returns
-------
tuple
- *adjacencies_score:* dict such that adjacencies_score[id] = score
of the adjacency with ID *id*
- *adj_ids:* dict for corresponding adjacencies and adjacency IDs:
adj_ids[id] = (bloc1, bloc2)
- *step_car_adjs:* dict such that step_car_adjs[id] = (car1, car2, type, step, labels)
where *car1, car2* corresponds to the car adjacency corresponding
to the adjacency with ID *id*, *type* is the type of adjacency,
and *step* is the step at which the adjacency was found, and *labels* is some
information about the presence (1) or absence (0) of the adjacency in each species.
"""
adjacencies_score = {}
adj_ids = {}
step_car_adjs = {}
iterator = utils_IO.read_conflict_adj_file(adj_file, nb_species, leaves, tree, spe_ids)
for labels, adj_id, adj, step_car_adj in iterator:
tab = compute_cost(tree, path, leaves, internals, labels, 2)
cost = tab[ancestor_id][2]
adjacencies_score[adj_id] = cost
adj_ids[adj_id] = adj
step_car_adjs[adj_id] = step_car_adj
return adjacencies_score, adj_ids, step_car_adjs
[docs]def find_conflicts_couples(adj_ids):
""" Find couples of adjacencies in conflict
Among all possible couples of adjacencies, finds the ones which involve a common end.
Parameters
----------
adj_ids : list
List of IDs such that adj_ids[id] = (bloc1, bloc2) with bloc1 and bloc2 the two block
numbers of the adjacency with id 'id'
Returns
-------
tuple
*conflicts:* a list containing all couples of adjacencies IDs which are in conflict (because
they share one end).
"""
conflicts = []
non_conflicts = []
# For each couple of two different adjacencies, check if they share a same end
for adj_couple_ids in itertools.combinations(adj_ids, 2):
adj_couple = (adj_ids[adj_couple_ids[0]], adj_ids[adj_couple_ids[1]])
(i1, i2), (j1, j2) = adj_couple
if(i1 == j1 or i1 == -j2 or i2 == -j1 or i2 == j2):
conflicts.append(adj_couple_ids)
else:
non_conflicts.append(adj_couple_ids)
return conflicts
[docs]def find_independant_sets(nb_adjacencies, conflicts):
""" Find sets of independent adjacencies
From the list of conflicting adjacencies, it tries to create sets by adding new adjacencies
in each set, which are compatible with all adjacencies already in the set.
The set of independent (= not in conflict) adjacencies is then growing at each iteration,
until we cannot add any more adjacency.
Parameters
----------
nb_adjacencies : int
Total number of adjacencies
conflicts : int
List of pairs of adjacencies IDs which are in conflict
Returns
-------
list
*independent_sets:* list of sets of compatible adjacencies. All sets have the same size,
which is the maximum possible size of an independent adjacencies set
"""
independent_sets = [] # array storing all independent_sets
# independent_sets found at last step:
last_independent_sets = [[i] for i in xrange(nb_adjacencies)]
size_max_independent_set = nb_adjacencies - 1 # maximum possible size of independent_set
effective_size_max = 1 # current maximum size found
# boolean to check that we increased the size of independent set. If not, stop algorithm:
adj_added = True
while effective_size_max < size_max_independent_set and adj_added:
effective_size_max += 1
adj_added = False
# For each adjacency set found in the last step
for cur_set in last_independent_sets:
# Try to add other adjacencies in the set...
for j in xrange(cur_set[-1] + 1, nb_adjacencies):
is_conflicting = False
# ...if not in conflict with any adjacency already in the set
for adj_id in cur_set:
if (adj_id, j) in conflicts or (j, adj_id) in conflicts:
is_conflicting = True
# if new adjacency 'j' is not in conflict with all adjs in cur_set, add it
if not is_conflicting:
adj_added = True
independent_sets.append(list(cur_set) + [j])
# If we added an adjacency to a set, we run a new step
# -> initialize sets for next step
if adj_added:
last_independent_sets = independent_sets
independent_sets = []
return last_independent_sets
[docs]def find_mincost_set(independent_sets, adjacencies_score):
""" Among all sets of compatible adjacencies of maximum size, find the one with minimum cost
Parameters
----------
independent_sets : list
List of sets of independent adjacencies IDs
adjacencies_score : list
List of scores of the adjacencies, such that adjacencies_score[*id*] = score of adjacency
with ID = *id*
Returns
-------
list
*maximum_set:* a list of compatible adjacencies, with minimum global cost
"""
maximum_set = []
min_cost = sys.maxint # maximum int value, not reachable by the cost
for cur_set in independent_sets:
cost = 0
for adj_id in cur_set:
cost += adjacencies_score[adj_id]
if cost < min_cost:
maximum_set = cur_set
min_cost = cost
return maximum_set
[docs]def change_adj_discarded_files(adj_prefix_file, step):
""" Save discarded adjacencies into a new file for information, and remove them from last file
Parameters
----------
adj_prefix_file : string
Prefix of adjacency file name
step : int
Current step of ProCars method
"""
last_adj_discard = adj_prefix_file + "_" + str(step) + ".txt_discarded.txt"
new_adj_discard = adj_prefix_file + "_" + str(step) + ".txt_discarded_used.txt"
# save discarded adjacencies into a new file with 'used' suffixe
shutil.move(last_adj_discard, new_adj_discard)
# empty discarded adjacencies file
open(last_adj_discard, "w").close()
[docs]def write_retained_adjs(adj_filename_prefix, adj_infos, maximum_set, adj_ids, step):
""" Write all adjacencies in the retained set of non-conflicting adjacencies (maximum size
and minimum cost) into a new adjacency file.
Parameters
----------
adj_filename_prefix : string
Prefix of adjacency file name
adj_infos : dict
Dictionary with adjacency IDs as keys, and values are a list with:
- the car adjacency (car1, car2)
- the type of adjacency
- the step at which it was found
- the presence of this adjacency in each species
maximum_set : list
List of retained adjacency ids
adj_ids : dict
Keys are adjacency IDs, and values are the adjacency corresponding to the ID
step : int
Current step of ProCars method
"""
last_adjs = adj_filename_prefix + "_" + str(step - 1) + ".txt"
new_adjs = adj_filename_prefix + "_" + str(step) + ".txt"
with open(last_adjs, "r") as last:
all_last_adjs = last.readlines()
utils_IO.write_retained_conflict_adjs(new_adjs, adj_infos, maximum_set, adj_ids)
with open(new_adjs, "a") as new:
new.writelines(all_last_adjs)
[docs]def main(adjacency_file_name, step, tree_bin):
"""
Main method.
From a set of conflicting conserved adjacencies with the set of species supporting each
adjacencies, it computes a maximum set of non-conflicting adjacencies.
Parameters
----------
adjacency_file_name : string
Prefix of the adjacency file
step : int
Current step of the ProCARs method
tree_bin : string
Name of the binary file containing all information about the species tree.
Returns
-------
tuple
*maximum_set:* list
List of retained adjacency ids (one of the independent_sets)
*independent_sets:* list
List of compatible adjacency sets. All sets have the same size,
which is the maximum possible size of an independent adjacencies set
"""
# Define filenames
conflicts_filename = adjacency_file_name + "_" + str(step) + ".txt_discarded.txt"
# Reading species tree file
[tree, ancestor_id, path, leaves, ordered_internals, nb_species,
_, _, _, spe_ids] = utils_IO.read_binary_file(tree_bin)
# Get conflicting adjacencies of last step
adj_res = get_adjacencies_costs(conflicts_filename, nb_species, leaves, tree, spe_ids,
path, ordered_internals, ancestor_id)
adjacencies_score, adj_ids, adj_infos = adj_res
# Find conflicting adjacencies two by two
conflicts = find_conflicts_couples(adj_ids)
# Find independant sets of adjacencies
independent_sets = find_independant_sets(len(adj_ids.keys()), conflicts)
# Computing the total cost of each maximum size independent set and choosing a
# minimum cost independent set
maximum_set = find_mincost_set(independent_sets, adjacencies_score)
# Writting the retained adjacencies
write_retained_adjs(adjacency_file_name, adj_infos, maximum_set, adj_ids, step)
change_adj_discarded_files(adjacency_file_name, step)
print("... " + str(len(maximum_set)) + " retained adjacencies\n")
return maximum_set, independent_sets