Source code for procars.step_modules.compute_initialpqtree

#!/usr/bin/env python
# -*- coding: utf-8 -*-:

"""
Copyright © Bonsai - LIFL (Université Lille 1, CNRS UMR 8022) and Inria-Lille Nord Europe

contact: aida.ouangraoua@inria.fr, amandine.perrin@inria.fr

This software is a computer program whose purpose is to progressively reconstruct ancestral
gene orders.

This software is governed by the CeCILL-B license under French law and
abiding by the rules of distribution of free software. You can use,
modify and/or redistribute the software under the terms of the CeCILL-B
license as circulated by CEA, CNRS and Inria at the following URL
http://www.cecill.info, or in the LICENCE file at the root directory of this program.

The fact that you are presently reading this means that you have had
knowledge of the CeCILL-B license and that you accept its terms.

---------------------------------------------------

``compute_initialpqtree`` **module description**:

This module reads the source files (tree file and blocks file), stores information,
and fills a CAR file with an initial set of CARs, each composed of a single block.

It needs, as input :

 * a species tree where a node is marked as the ancestor: @
 * a set of universal synteny blocks
 * a CAR file name (in which initial PQtree will be saved)

.. moduleauthor:: Aïda Ouangraoua, Amandine PERRIN

February 2014

"""
from procars.utils import utils_IO
from procars.utils import initial_files_reader as read_init


[docs]def mark_ancestors(tree, ancestor_id):
    """ Method marking tree nodes which are on the path between the ancestor node and the root node

    Parameters
    ----------
    tree : dict
        Tree structure :

        {node_id: [node_name, parent_id, left_child_id, right_child_id], ...}

        where id = -1 if no parent/left_child/right_child
        and node_name = *"N"* + *node_id* for internal nodes, or *leaf_name* for leaf nodes
    ancestor_id : int
        ID of ancestor node in tree

    Returns
    -------
    list
        0-1 array such that path[node_id] = 1 if node is on the path between ancestor node and root,
        and otherwise path[node_id] = 0
    """
    nb_nodes = len(tree.keys())  # number of nodes in tree
    path = [0 for _ in range(nb_nodes + 1)]
    cur_node = ancestor_id
    while(cur_node != -1):
        path[cur_node] = 1
        cur_node = tree[cur_node][1]   # Next node is the parent of current node
    return path


[docs]def list_leaves_internals(tree, root_id, path):
    """ Function computing a list of leaves and a postfix ordered list of internal nodes for a tree

    Recursive algorithm visiting nodes from ancestor node to leaves, in a prefix order.

    Parameters
    ----------
    tree : dict
        Tree structure :

        {node_id: [node_name, parent_id, left_child_id, right_child_id], ...}

        where id = -1 if no parent/left_child/right_child
        and node_name = *"N" + node_id* for internal nodes, or *leaf_name* for leaf nodes
    root_id : int
        The node considered as the new root of the tree
    path : list
        0-1 array such that path[node_id] = 1 if node is on the path between ancestor node and root,
        and otherwise path[node_id]=0

    Returns
    -------
    list
        *leaves_internals:* list containing two lists: a list of leaves ids and an ordered list of
        internal node ids
    """
    leaves_internals = [[], []]   # array, (leave ids array, internal node ids array)
    # if id is a leaf
    if(tree[root_id][2] == -1 and tree[root_id][3] == -1):
        # add it in leaves array
        leaves_internals = [[root_id], []]
    else:
        # add it in internal nodes array
        leaves_internals = [[], [root_id]]
        # recover child1, child2, and parent id
        child1_id = tree[root_id][2]
        child2_id = tree[root_id][3]
        parent_id = tree[root_id][1]
        # add new leaves at the end of leaves array
        # add new internal node at the beginning of internal nodes array
        # recover list of leaves, and list of internal nodes from child1
        if(path[child1_id] != 1):
            leaves_internals_child1 = list_leaves_internals(tree, child1_id, path)
            leaves_internals[0] = leaves_internals[0] + leaves_internals_child1[0]
            leaves_internals[1] = leaves_internals_child1[1] + leaves_internals[1]
        # recover list of leaves, and list of internal nodes from child2
        if(path[child2_id] != 1):
            leaves_internals_child2 = list_leaves_internals(tree, child2_id, path)
            leaves_internals[0] = leaves_internals[0] + leaves_internals_child2[0]
            leaves_internals[1] = leaves_internals_child2[1] + leaves_internals[1]
        # recover list of leaves, and list of internal nodes from parent
        if(path[root_id] == 1):
            if(parent_id != -1):
                leaves_internals_parent = list_leaves_internals(tree, parent_id, path)
                leaves_internals[0] = leaves_internals[0] + leaves_internals_parent[0]
                leaves_internals[1] = leaves_internals_parent[1] + leaves_internals[1]
    return leaves_internals


[docs]def compute_parameters_from_tree(tree_file_name):
    """ Function computing parameters from a tree file

    Parameters
    ----------
    tree_file_name : String
        Name of file containing the species tree to read to extract parameters

    Returns
    -------
    tuple
        *tree:* dict, tree structure

        *ancestor_id:* int, id of ancestor node in tree

        *path:* list, array of 0 and 1 indicating nodes that are ancestor of ancestor node

        *leaves:* list, array of integers containing a list of leave ids

        *internals:* list, integer array containing an ordered list of internal nodes ids in a
        postfix order with ancestor node as root
    """
    tree, ancestor_id = read_init.read_treefile(tree_file_name)
    path = mark_ancestors(tree, ancestor_id)
    leaves, ordered_internals = list_leaves_internals(tree, ancestor_id, path)
    return tree, ancestor_id, path, leaves, ordered_internals


[docs]def partition_species(species1, species2, species3, tree, ancestor_id, cur_id, partition_id):
    """ Function computing the partition of species defined by the ancestor node

    Recursive algorithm

    Parameters
    ----------
    species1 : list
        Species names in group 1 array to be filled
    species2 : list
        Species names in group 2 array to be filled
    species3 : list
        Species names in group 3 array to be filled
    tree : dict
        Tree structure :

        {node_id: [node_name, parent_id, left_child_id, right_child_id], ...}

        Where *id* = -1 if no parent/left_child/right_child
        And node_name = *"N" + node_id* for internal nodes, or *leaf_name* for leaf nodes
    ancestor_id : int
        ID of ancestor node in tree
    cur_id : int
        ID of current node during the tree transversal
    partition_id : int
        ID of current group in partition

    Returns
    -------
    list
        *species1:* String array, names of group1 species

        *species2:* String array, names of group2 species

        *species3:* String array, names of group3 species
    """
    child1 = tree[cur_id][2]
    child2 = tree[cur_id][3]
    # if node is leaf (species)
    spe_groups = [species1, species2, species3]
    if(child1 == -1 and child2 == -1):
        spe_name = tree[cur_id][0]
        spe_groups[partition_id - 1].append(spe_name)
    # if node is an internal node
    else:
        # if node is the ancestor
        if(cur_id == ancestor_id):
            partition_species(species1, species2, species3, tree, ancestor_id, child1, 2)
            partition_species(species1, species2, species3, tree, ancestor_id, child2, 3)
        # if node is not the ancestor
        else:
            partition_species(species1, species2, species3, tree, ancestor_id, child1, partition_id)
            partition_species(species1, species2, species3, tree, ancestor_id, child2, partition_id)
    return species1, species2, species3


[docs]def compute_partition_and_species_id(tree, ancestor_id):
    """ Function calling the method partitionning the tree into 3 groups defined by the
    ancestor, and determining the 3 groups of species IDs, as well as the list of species with
    their corresponding ID.

    Parameters
    ----------
    tree : dict
        Tree structure :

        {node_id: [node_name, parent_id, left_child_id, right_child_id], ...}

        Where *id* = -1 if no parent/left_child/right_child
        And node_name = *"N" + node_id* for internal nodes, or *leaf_name* for leaf nodes
    ancestor_id : int
        ID of ancestor node in tree

    Returns
    -------
    tuple
        *species1_id:* integer array, ids of group1 species

        *species2_id:* integer array, ids of group2 species

        *species3_id:* integer array, ids of group3 species

        *spe_ids:* map such that spe_ids[spe_name] = spe_id
    """
    species1 = []   # array of output species names
    species2 = []   # array of input1 species names
    species3 = []   # array of input2 species names
    partition_species(species1, species2, species3, tree, ancestor_id, 1, 1)
    spe_ids = {spe: cur_id for cur_id, spe in enumerate(species1 + species2 + species3)}
    species1_id = [spe_ids[spe] for spe in species1]
    species2_id = [spe_ids[spe] for spe in species2]
    species3_id = [spe_ids[spe] for spe in species3]
    return species1_id, species2_id, species3_id, spe_ids


[docs]def commons_read_tree(tree_file, binary_file_name):
    """
    Reading, and computing parameters and partitions from a tree file, and saving values computed
    in a binary file.

    Parameters
    ----------
    tree_file_name : String
        Name of file containing species tree
    binary_file_name : String
        Name of file where results must be stored

    Returns
    -------
    list
        *nb_species:* int, total number of species

        *spe_ids:* dict, {species1: id1, species2: id2, ...}
    """
    # read and compute parameters from species tree file
    tree, ancestor_id, path, leaves, ordered_internals = compute_parameters_from_tree(tree_file)
    nb_species = len(leaves)  # number of species in tree
    # Ancestor node defines a partition of species into 3 groups: determine it
    res = compute_partition_and_species_id(tree, ancestor_id)
    species1_id, species2_id, species3_id, spe_ids = res
    # save computed values in a binary file
    values_saved = [tree, ancestor_id, path, leaves, ordered_internals, nb_species,
                    species1_id, species2_id, species3_id, spe_ids]
    utils_IO.save_binary_information(binary_file_name, values_saved)
    return nb_species, spe_ids


[docs]def compute_blocks_neighbors(nb_blocks, nb_species, blocks):
    """ Function computing block neighbors

    Parameters
    ----------
    nb_blocks : int
        Total number of blocks
    nb_species : int
        Total number of species
    blocks : list
        List of lists

        blocks = [[*list of blocks for spe_id=0*], [*list of blocks for spe_id=1*], ...]

        with *list of blocks for spe_id=1* =
        [[*list of blocks for spe_id=1, chromosome 1*],
        [*list of blocks for spe_id=1, chromosome 2*],
        ...]

        with *list of blocks for spe_id=1, chromosome 1* =
        [[start, end, num_bloc, orient],
        [start, end, num_bloc, orient],
        ... for the n blocks]

    Returns
    -------
    tuple
        *left :* array such that left[spe_id][block_id] is an array containing the left
        neighbor of block block_id in spe_id

        *right :* array such that right[spe_id][block_id] is an array containing the right
        neighbor of block_id in spe_id

    """
    left = []
    # left = for each species : [[0, block_before0], [0, block_before1], [0, block_before2], ...]
    right = []
    # right = for each species : [[0], [0, block_after1], [0, block_after2], ...]
    # add cells of arrays left and right
    for spe_id in range(nb_species):
        left.append([])
        right.append([])
        for _ in range(nb_blocks + 1):
            left[spe_id].append([0])
            right[spe_id].append([0])
        for chr_id in range(len(blocks[spe_id])):
            # sort blocks by start position
            blocks[spe_id][chr_id].sort()
    # for each species
    for spe_id in range(nb_species):
        # for each chromosomes in species
        for chr_id in range(len(blocks[spe_id])):
            last_block_id = 0
            last_block_dir = 0
            # for each block in chromosome
            for signed_block in blocks[spe_id][chr_id]:
                block_id = signed_block[2]
                block_dir = signed_block[3]
                # compute left and right neighbor
                if(block_dir == 1):
                    left[spe_id][block_id].append(last_block_id * last_block_dir)
                elif(block_dir == -1):
                    right[spe_id][block_id].append(-last_block_id * last_block_dir)
                if(last_block_dir == 1):
                    right[spe_id][last_block_id].append(block_id * block_dir)
                elif(last_block_dir == -1):
                    left[spe_id][last_block_id].append(-block_id * block_dir)
                last_block_id = block_id
                last_block_dir = block_dir
            if(last_block_dir == 1):
                right[spe_id][last_block_id].append(0)
            else:
                left[spe_id][last_block_id].append(0)
    return left, right


[docs]def commons_read_blocks(block_file_name, nb_species, spe_ids, binary_file_name):
    """ Function reading blocks from a block file and computing block neighbors. Saves all
    information about blocks into a binary file, used in the next steps

    Parameters
    ----------
    block_file_name : String
        Name of file containing the ordered blocks for all species
    nb_species : int
        Total number of species
    spe_ids : dict
        Species and their corresponding ID {spe1: id1, spe2: id2, ...}
    binary_file_name : String
        Name of the file in which saving all information about the blocks

    Returns
    -------
    int
        number of blocks

    """
    # read blocks file
    blocks, nb_blocks = read_init.read_block_file(block_file_name, nb_species, spe_ids)
    # Find previous and next blocks
    left, right = compute_blocks_neighbors(nb_blocks, nb_species, blocks)
    # save computed values in a binary file
    values_saved = [blocks, nb_blocks, left, right]
    utils_IO.save_binary_information(binary_file_name, values_saved)
    return nb_blocks


[docs]def write_init_cars_file(car_file_name, nb_blocks):
    """ Creates a CAR file, with an initial set of CARs, each composed of a single block

    Parameters
    ----------
    car_file_name : String
        File in which saving the CARs
    nb_blocks : int
        Total number of blocks = number of CARs in the output file
    """
    with open(car_file_name, "w") as output_file:
        for num in range(1, nb_blocks + 1):
            output_file.write("#CAR" + str(num) + "\n")
            output_file.write("_Q " + str(num) + " Q_\n")


[docs]def main(tree_file_name, block_file_name, car_file_name, tree_bin, blocks_bin, adj_file):
    """
    Main method.

    Reads tree file and block file, stores information into binary files, and creates a
    CAR file with an initial set of CARs, each composed of a single block. Also creates an
    empty adjacency file (no adjacency added at step 0).

    Parameters
    ----------
    tree_file_name : string
        Name of the species tree to read
    block_file_name : string
        Name of the blocks file to read
    car_file_name : string
        Name of PQtree file in which saving the initial set of CARs
    tree_bin : string
        Name of binary file in which saving all information found in tree file
    blocks_bin : string
        Name of binary file in which saving all information found in blocks file
    adj_file : string
        Name of adjacency file in which saving adjacencies added at step0
        (= no adjacency -> empty file)

    Returns
    -------
    int
        total number of blocks
    """
    # Step1 : reading tree file
    nb_species, species_ids = commons_read_tree(tree_file_name, tree_bin)
    # Step2 : reading blocks file
    nb_blocks = commons_read_blocks(block_file_name, nb_species, species_ids, blocks_bin)
    # Step3 : Writing CARs
    write_init_cars_file(car_file_name, nb_blocks)
    # Write adjacency file for step0: empty adjacency file
    open(adj_file, 'a').close()
    open(adj_file + "_discarded.txt", "a").close()
    print("... " + str(nb_blocks) + " cars")
    return nb_blocks
Navigation

Source code for procars.step_modules.compute_initialpqtree

Quick search

Navigation