
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Copyright © Bonsai - LIFL (Université Lille 1, CNRS UMR 8022) and Inria-Lille Nord Europe

contact: aida.ouangraoua@inria.fr, amandine.perrin@inria.fr

This software is a computer program whose purpose is to progressively reconstruct ancestral
gene orders.

This software is governed by the CeCILL-B license under French law and
abiding by the rules of distribution of free software. You can use,
modify and/or redistribute the software under the terms of the CeCILL-B
license as circulated by CEA, CNRS and Inria at the following URL
http://www.cecill.info, or in the LICENCE file at the root directory of this program.

The fact that you are presently reading this means that you have had
knowledge of the CeCILL-B license and that you accept its terms.

---------------------------------------------------

``compute_resolved_conflicts`` **module description**:

This module resolves conflicts in a given set of adjacencies when, at the previous
step, the set of non-conflicting adjacencies was empty.


*Method:*

From a set of conflicting conserved adjacencies, together with the set of species
supporting each adjacency, it computes a maximum set of non-conflicting adjacencies.
We use Sankoff's algorithm for the minimal mutation problem.

.. moduleauthor:: Aïda Ouangraoua, Amandine PERRIN

February 2014
"""


import itertools
import shutil
import sys

from procars.utils import utils_IO


def compute_cost(tree, path, leaves, internals, labels, nb_labels):
    """
    Function computing optimal labels for the ancestor node in a tree

    Dynamic programming version of the Fitch Algorithm for the Small Parsimony
    Problem (Fitch 71)

    Parameters
    ----------
    tree : dict
        Tree structure: {node_id: [node_name, parent_id, left_child_id, right_child_id], ...}
        Where *id* = -1 if no parent/left_child/right_child
        And node_name = *"N" + node_id* for internal nodes, or *leaf_name* for leaf nodes
    path : list
        Array associating value 1 to a node if it is on the path between the ancestor
        and the root node, otherwise value 0
    leaves : list
        List of leaf IDs in the tree
    internals : list
        Ordered list of internal node IDs in the tree
    labels : dict
        Labels of the leaf nodes: 0 if the adjacency is absent, 1 if present
    nb_labels : int
        Number of label types

    Returns
    -------
    list
        *tab:* costs for each node of the tree
    """
    nb_nodes = len(tree)
    max_cost = sys.maxint
    cost = 0
    tab = []
    # Initialize the cost table: one row per node, one column per label
    for i in range(nb_nodes + 1):
        tab.append([])
        for label_id in range(nb_labels + 1):
            tab[i].append(0)
        tab[i][0] = max_cost
    # Leaves: cost 0 for the observed labels, infinite cost otherwise
    for leaf in leaves:
        if(len(labels[leaf]) > 0):
            for label_id in range(1, nb_labels + 1):
                if(label_id in labels[leaf]):
                    tab[leaf][label_id] = 0
                else:
                    tab[leaf][label_id] = max_cost
    # Internal nodes: combine the costs of the children (and of the parent for
    # nodes on the ancestor-root path)
    for node_id in internals:
        child1_id = tree[node_id][2]
        child2_id = tree[node_id][3]
        parent_id = tree[node_id][1]
        for label_id in range(1, nb_labels + 1):
            cost = 0
            if(path[child1_id] != 1):
                cost += min([min(tab[child1_id]) + 1, tab[child1_id][label_id]])
            if(path[child2_id] != 1):
                cost += min([min(tab[child2_id]) + 1, tab[child2_id][label_id]])
            if(path[node_id] == 1):
                if(parent_id != -1):
                    cost += min([min(tab[parent_id]) + 1, tab[parent_id][label_id]])
            tab[node_id][label_id] = cost
    return tab


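# Illustrative sketch, not part of the original ProCARs code: a hypothetical call of
# compute_cost() on a small 5-node tree with three leaves (A, B, C), internal node N3
# as the ancestor of interest and N4 as the root. All names, node IDs and label IDs
# below are invented for illustration; real inputs come from utils_IO.read_binary_file
# and utils_IO.read_conflict_adj_file.
def _toy_compute_cost_example():
    """Hypothetical example showing the expected shape of compute_cost() inputs."""
    # node_id: [node_name, parent_id, left_child_id, right_child_id]
    toy_tree = {0: ["A", 3, -1, -1], 1: ["B", 3, -1, -1], 2: ["C", 4, -1, -1],
                3: ["N3", 4, 0, 1], 4: ["N4", -1, 3, 2]}
    toy_path = [0, 0, 0, 1, 1]        # N3 (ancestor) and N4 (root) are on the path
    toy_leaves = [0, 1, 2]
    toy_internals = [3, 4]            # children listed before their parent
    toy_labels = {0: [2], 1: [2], 2: [1]}  # allowed label IDs (1 or 2) for each leaf
    # tab[node_id][label_id] gives the cost of assigning label_id to node_id
    return compute_cost(toy_tree, toy_path, toy_leaves, toy_internals, toy_labels, 2)

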
def get_adjacencies_costs(adj_file, nb_species, leaves, tree, spe_ids, path,
                          internals, ancestor_id):
    """
    Read the file where discarded adjacencies were stored, and compute the cost of
    each of these adjacencies

    Parameters
    ----------
    adj_file : string
        File in which discarded adjacencies are written
    nb_species : int
        Total number of species
    leaves : list
        List of IDs of tree leaves (= genomes)
    tree : dict
        Tree structure {node_id: [node_name, parent_id, left_child_id, right_child_id], ...}
        Where *id* = -1 if no parent/left_child/right_child
        And node_name = *"N" + node_id* for internal nodes, or *leaf_name* for leaf nodes
    spe_ids : dict
        Species as keys, and their corresponding ID as value
    path : list
        Array associating value 1 to a node if it is on the path between the ancestor
        and the root node, otherwise value 0
    internals : list
        IDs of internal nodes of the tree
    ancestor_id : int
        ID of the ancestor node

    Returns
    -------
    tuple
        - *adjacencies_score:* dict such that adjacencies_score[id] = score of the
          adjacency with ID *id*
        - *adj_ids:* dict mapping adjacency IDs to adjacencies: adj_ids[id] = (bloc1, bloc2)
        - *step_car_adjs:* dict such that step_car_adjs[id] = (car1, car2, type, step, labels),
          where *car1, car2* is the CAR adjacency corresponding to the adjacency with ID *id*,
          *type* is the type of adjacency, *step* is the step at which the adjacency was
          found, and *labels* gives the presence (1) or absence (0) of the adjacency in
          each species.
    """
    adjacencies_score = {}
    adj_ids = {}
    step_car_adjs = {}
    iterator = utils_IO.read_conflict_adj_file(adj_file, nb_species, leaves, tree, spe_ids)
    for labels, adj_id, adj, step_car_adj in iterator:
        # cost of the adjacency at the ancestor node (label 2 column of the cost table)
        tab = compute_cost(tree, path, leaves, internals, labels, 2)
        cost = tab[ancestor_id][2]
        adjacencies_score[adj_id] = cost
        adj_ids[adj_id] = adj
        step_car_adjs[adj_id] = step_car_adj
    return adjacencies_score, adj_ids, step_car_adjs


def find_conflicts_couples(adj_ids):
    """
    Find couples of adjacencies in conflict

    Among all possible couples of adjacencies, find the ones which involve a common end.

    Parameters
    ----------
    adj_ids : dict
        Dictionary such that adj_ids[id] = (bloc1, bloc2), with bloc1 and bloc2 the two
        block numbers of the adjacency with ID *id*

    Returns
    -------
    list
        *conflicts:* a list containing all couples of adjacency IDs which are in
        conflict (because they share one end).
    """
    conflicts = []
    non_conflicts = []  # compatible couples (collected but not returned)
    # For each couple of two different adjacencies, check if they share a same end
    for adj_couple_ids in itertools.combinations(adj_ids, 2):
        adj_couple = (adj_ids[adj_couple_ids[0]], adj_ids[adj_couple_ids[1]])
        (i1, i2), (j1, j2) = adj_couple
        if(i1 == j1 or i1 == -j2 or i2 == -j1 or i2 == j2):
            conflicts.append(adj_couple_ids)
        else:
            non_conflicts.append(adj_couple_ids)
    return conflicts


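# Illustrative sketch, not part of the original ProCARs code: with the three
# hypothetical adjacencies below, (1, 2) and (1, 3) share the block end 1, so the
# couple of IDs (0, 1) is reported as conflicting, while adjacency 2 conflicts with
# neither of them.
def _toy_conflicts_example():
    """Hypothetical example of calling find_conflicts_couples()."""
    toy_adj_ids = {0: (1, 2), 1: (1, 3), 2: (4, 5)}
    return find_conflicts_couples(toy_adj_ids)  # expected result: [(0, 1)]

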
def find_independant_sets(nb_adjacencies, conflicts):
    """
    Find sets of independent adjacencies

    From the list of conflicting adjacencies, it builds sets by adding, at each
    iteration, new adjacencies that are compatible with all adjacencies already in a
    set. The sets of independent (= non-conflicting) adjacencies thus grow at each
    iteration, until no more adjacency can be added.

    Parameters
    ----------
    nb_adjacencies : int
        Total number of adjacencies
    conflicts : list
        List of pairs of adjacency IDs which are in conflict

    Returns
    -------
    list
        *independent_sets:* list of sets of compatible adjacencies. All sets have the
        same size, which is the maximum possible size of an independent adjacency set
    """
    independent_sets = []  # array storing all independent_sets
    # independent_sets found at the last step:
    last_independent_sets = [[i] for i in xrange(nb_adjacencies)]
    size_max_independent_set = nb_adjacencies - 1  # maximum possible size of an independent set
    effective_size_max = 1  # current maximum size found
    # boolean checking that the size of the independent sets increased. If not, stop:
    adj_added = True
    while effective_size_max < size_max_independent_set and adj_added:
        effective_size_max += 1
        adj_added = False
        # For each adjacency set found at the last step
        for cur_set in last_independent_sets:
            # Try to add other adjacencies to the set...
            for j in xrange(cur_set[-1] + 1, nb_adjacencies):
                is_conflicting = False
                # ...if not in conflict with any adjacency already in the set
                for adj_id in cur_set:
                    if (adj_id, j) in conflicts or (j, adj_id) in conflicts:
                        is_conflicting = True
                # if new adjacency 'j' is compatible with all adjs in cur_set, add it
                if not is_conflicting:
                    adj_added = True
                    independent_sets.append(list(cur_set) + [j])
        # If we added an adjacency to a set, we run a new step
        # -> initialize the sets for the next step
        if adj_added:
            last_independent_sets = independent_sets
            independent_sets = []
    return last_independent_sets


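# Illustrative sketch, not part of the original ProCARs code: continuing the toy
# example above, with three adjacencies where only IDs 0 and 1 are in conflict, the
# maximum independent sets both have size 2.
def _toy_independent_sets_example():
    """Hypothetical example of calling find_independant_sets()."""
    toy_conflicts = [(0, 1)]
    return find_independant_sets(3, toy_conflicts)  # expected result: [[0, 2], [1, 2]]

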
def find_mincost_set(independent_sets, adjacencies_score):
    """
    Among all sets of compatible adjacencies of maximum size, find the one with
    minimum cost

    Parameters
    ----------
    independent_sets : list
        List of sets of independent adjacency IDs
    adjacencies_score : dict
        Scores of the adjacencies, such that adjacencies_score[*id*] = score of the
        adjacency with ID *id*

    Returns
    -------
    list
        *maximum_set:* a list of compatible adjacencies, with minimum global cost
    """
    maximum_set = []
    min_cost = sys.maxint  # maximum int value, not reachable by the cost
    for cur_set in independent_sets:
        # total cost of the current set
        cost = 0
        for adj_id in cur_set:
            cost += adjacencies_score[adj_id]
        if cost < min_cost:
            maximum_set = cur_set
            min_cost = cost
    return maximum_set


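# Illustrative sketch, not part of the original ProCARs code: given the two maximum
# independent sets from the toy example above and hypothetical per-adjacency costs,
# the set with the smallest total cost is retained.
def _toy_mincost_example():
    """Hypothetical example of calling find_mincost_set()."""
    toy_sets = [[0, 2], [1, 2]]
    toy_scores = {0: 1, 1: 0, 2: 2}
    return find_mincost_set(toy_sets, toy_scores)  # total costs 3 vs 2 -> [1, 2]

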
def change_adj_discarded_files(adj_prefix_file, step):
    """
    Save discarded adjacencies into a new file for information, and remove them from
    the last file

    Parameters
    ----------
    adj_prefix_file : string
        Prefix of the adjacency file name
    step : int
        Current step of the ProCARs method
    """
    last_adj_discard = adj_prefix_file + "_" + str(step) + ".txt_discarded.txt"
    new_adj_discard = adj_prefix_file + "_" + str(step) + ".txt_discarded_used.txt"
    # save discarded adjacencies into a new file with the 'used' suffix
    shutil.move(last_adj_discard, new_adj_discard)
    # empty the discarded adjacencies file
    open(last_adj_discard, "w").close()


def write_retained_adjs(adj_filename_prefix, adj_infos, maximum_set, adj_ids, step):
    """
    Write all adjacencies in the retained set of non-conflicting adjacencies
    (maximum size and minimum cost) into a new adjacency file.

    Parameters
    ----------
    adj_filename_prefix : string
        Prefix of the adjacency file name
    adj_infos : dict
        Dictionary with adjacency IDs as keys, and values are a list with:

        - the CAR adjacency (car1, car2)
        - the type of adjacency
        - the step at which it was found
        - the presence of this adjacency in each species
    maximum_set : list
        List of retained adjacency IDs
    adj_ids : dict
        Keys are adjacency IDs, and values are the adjacency corresponding to the ID
    step : int
        Current step of the ProCARs method
    """
    last_adjs = adj_filename_prefix + "_" + str(step - 1) + ".txt"
    new_adjs = adj_filename_prefix + "_" + str(step) + ".txt"
    # keep the adjacencies of the previous step
    with open(last_adjs, "r") as last:
        all_last_adjs = last.readlines()
    # write the retained conflicting adjacencies, then append the previous ones
    utils_IO.write_retained_conflict_adjs(new_adjs, adj_infos, maximum_set, adj_ids)
    with open(new_adjs, "a") as new:
        new.writelines(all_last_adjs)


def main(adjacency_file_name, step, tree_bin):
    """
    Main method. From a set of conflicting conserved adjacencies, together with the
    set of species supporting each adjacency, it computes a maximum set of
    non-conflicting adjacencies.

    Parameters
    ----------
    adjacency_file_name : string
        Prefix of the adjacency file
    step : int
        Current step of the ProCARs method
    tree_bin : string
        Name of the binary file containing all information about the species tree.

    Returns
    -------
    tuple
        - *maximum_set:* list of retained adjacency IDs (one of the independent_sets)
        - *independent_sets:* list of compatible adjacency sets. All sets have the
          same size, which is the maximum possible size of an independent adjacency set
    """
    # Define filenames
    conflicts_filename = adjacency_file_name + "_" + str(step) + ".txt_discarded.txt"
    # Read the species tree file
    [tree, ancestor_id, path, leaves, ordered_internals, nb_species,
     _, _, _, spe_ids] = utils_IO.read_binary_file(tree_bin)
    # Get the conflicting adjacencies of the last step and their costs
    adj_res = get_adjacencies_costs(conflicts_filename, nb_species, leaves, tree,
                                    spe_ids, path, ordered_internals, ancestor_id)
    adjacencies_score, adj_ids, adj_infos = adj_res
    # Find the conflicting adjacencies two by two
    conflicts = find_conflicts_couples(adj_ids)
    # Find independent sets of adjacencies
    independent_sets = find_independant_sets(len(adj_ids.keys()), conflicts)
    # Compute the total cost of each maximum-size independent set and choose a
    # minimum-cost independent set
    maximum_set = find_mincost_set(independent_sets, adjacencies_score)
    # Write the retained adjacencies
    write_retained_adjs(adjacency_file_name, adj_infos, maximum_set, adj_ids, step)
    change_adj_discarded_files(adjacency_file_name, step)
    print("... " + str(len(maximum_set)) + " retained adjacencies\n")
    return maximum_set, independent_sets


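# Illustrative usage sketch, not part of the original ProCARs code: the file prefix,
# step number and tree binary below are hypothetical. main() expects the discarded
# adjacency file "<prefix>_<step>.txt_discarded.txt" and the previous adjacency file
# "<prefix>_<step - 1>.txt" to exist, as produced by the earlier ProCARs steps.
#
#   retained, all_max_sets = main("output/adjacencies", 3, "output/tree.bin")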