Source code for procars.utils.initial_files_reader

#!/usr/bin/env python
# -*- coding: utf-8 -*-:

"""
Copyright © Bonsai - LIFL (Université Lille 1, CNRS UMR 8022) and Inria-Lille Nord Europe

contact: aida.ouangraoua@inria.fr, amandine.perrin@inria.fr

This software is a computer program whose purpose is to progressively reconstruct ancestral
gene orders.

This software is governed by the CeCILL-B license under French law and
abiding by the rules of distribution of free software. You can use,
modify and/or redistribute the software under the terms of the CeCILL-B
license as circulated by CEA, CNRS and Inria at the following URL
http://www.cecill.info, or in the LICENCE file at the root directory of this program.

The fact that you are presently reading this means that you have had
knowledge of the CeCILL-B license and that you accept its terms.

---------------------------------------------------

``initial_files_reader`` **module description**:

This module contains some methods used to read and parse the initial files:

    * Phylogeny tree file (where a node is marked as the ancestor: @)
    * Blocks file (with block order of each genome of the tree)


.. moduleauthor:: Aïda Ouangraoua, Amandine PERRIN

May 2014

"""


import re


[docs]def read_treefile(tree_file_name):
    """ Function reading the tree structure from a file

    Parameters
    ----------
    tree_file_name : String
        Name of file containing species tree

    Returns
    -------
    tuple
        *tree:* dict, tree structure:
              |  {node_id: [node_name, parent_id, left_child_id, right_child_id], ...}
              |  where *id* = -1 if no *parent/left_child/right_child*
              |  and *node_name* = "N" + *node_id* for internal nodes, or *leaf_name* for leaf nodes
        *ancestor_id:* int, id of ancestor node in tree
    """
    ancestor_id = -1    # integer, identifier of ancestor node to be reconstructed
    tree = {}           # a map for the tree structure (see docstring hereabove)
    with open(tree_file_name, "r") as treefile:
        treeseq = treefile.readline()
    cur_id = 0    # ids to be associated to nodes
    pile = []     # pile used to remember ancestor nodes
    pos = 0       # integer containing successive positions in the string treeseq
    while pos < len(treeseq):
        # start to read a new internal node
        if treeseq[pos] == "(":
            cur_id += 1
            tree[cur_id] = ["N" + str(cur_id), -1, -1, -1]
            # if node has a parent
            if len(pile) > 0:
                # Add parent ID in the current node array (index 1)
                tree[cur_id][1] = pile[-1]
                # Add current node as a child to his parent node.
                if(tree[pile[-1]][2] == -1):
                    tree[pile[-1]][2] = cur_id
                else:
                    tree[pile[-1]][3] = cur_id
            # add node on top of the pile
            pile.append(cur_id)
            pos += 1
        # end of node
        elif treeseq[pos] == ")":
            pos += 1
            # if node is the ancestor node to be reconstructed, save its ID
            if treeseq[pos] == "@":
                ancestor_id = pile[-1]
                pos += 1
            # check if end of line is reached
            if treeseq[pos] == ";" or treeseq[pos] == "\n" or treeseq == " ":
                pos += 2
            del pile[-1]
        # new sibling
        elif treeseq[pos] == ",":
            pos += 1
        #  leaf node
        else:
            # Give node a new ID
            cur_id += 1
            tree[cur_id] = ["", -1, -1, -1]
            # if node as a parent
            if len(pile) > 0:
                # add parent at position 1 in array
                tree[cur_id][1] = pile[-1]
                # add current node id in children position of parent array
                if(tree[pile[-1]][2] == -1):
                    tree[pile[-1]][2] = cur_id
                else:
                    tree[pile[-1]][3] = cur_id
            # add leaf species name at position 0 in the array
            begin = pos
            while(pos < len(treeseq) and treeseq[pos] != "," and treeseq[pos] != ")"):
                pos += 1
            nom = treeseq[begin: pos]
            nom = "_".join(nom.split())  # Remove spaces in leaf name if there are
            tree[cur_id][0] = nom
    return tree, ancestor_id


[docs]def read_block_file(block_file_name, nb_species, spe_ids):
    """ Method reading blocks from a block file

    Parameters
    ----------
    block_file_name : String
        Name of file containing the ordered blocks for all species
    nb_species : int
        Total number of species
    spe_ids : dict
        Species and their corresponding ID {spe1: id1, spe2: id2, ...}

    Returns
    -------
    list
        *blocks:* array of arrays (blocks)
            |  blocks = [[*list of blocks for spe_id=0*], [*list of blocks for spe_id=1*], ...]
            |  with *list of blocks for spe_id=1* = [[*list of blocks for spe_id=1, chromosome 1*],
                                                     [*list of blocks for spe_id=1, chromosome 2*],
                                                     ...
                                                    ]
            |  with *list of blocks for spe_id=1, chromosome 1* = [[start, end, num_bloc, orient],
                                                                   [start, end, num_bloc, orient],
                                                                    ... for the n blocks
                                                                  ]
        *nb_blocks:* int, total number of blocks
    """
    blocks = [[] for _ in xrange(nb_species)]   # array containing arrays (one for each species)
    with open(block_file_name, "r") as blocfile:
        chm = {}   # map : chr_name -> integer_id
        chm['X'] = 0
        bloc_id = 0    # Current bloc id
        for line in blocfile:
            chm, bloc_id = read_block_line(line, chm, spe_ids, blocks, bloc_id)
    nb_blocks = bloc_id
    return blocks, nb_blocks


[docs]def read_block_line(line, chm, spe_ids, blocks, bloc_id):
    """ Reading one line of a block file, in order to return a new block number, or
    the positions of a block in a species chromosome.

    Parameters
    ----------
    line : String
        A line of the block file
    chm : dict
        Chromosome IDs : *{chromo: id, chromo2: id2, ...}*
    spe_ids : dict
        Species and their corresponding ID *{spe1: id1, spe2: id2, ...}*
    blocks : list
        Array of blocks already read, to complete
    bloc_id : int
        Current bloc id

    Returns
    -------
    list
        *chm:* dict, chromosome IDs : *{chromo: id, chromo2: id2, ...}*

        *bloc_id:* int, current bloc id
    """
    # New block
    if line[0] == '>':
        # find line index of '>' (if not first caracter)
        end = [m.end() for m in re.finditer('>', line)]
        bloc_id = int(line[end[0]:].strip())
    # Get block positions for one species
    elif len(line.strip()) > 0:
        spe = line.strip().split(".")[0]
        # if the species exists in our list of species ids, add to block information
        if spe in spe_ids:
            chromo = line.strip().split(".")[-1].split(":")[0]
            start = int(line.split(":")[-1].split("-")[0].strip())
            end = int(line.split("-")[1].split()[0].strip())
            orient = line.strip().split()[-1]
            orient = 1 if orient == "+" else -1
            if not chromo.upper() in chm:
                chm[chromo.upper()] = max(chm.values()) + 1
            if (chm[chromo.upper()] >= len(blocks[spe_ids[spe]])):
                nb_chr_add = chm[chromo.upper()] - len(blocks[spe_ids[spe]]) + 1
                for _ in xrange(nb_chr_add):
                    blocks[spe_ids[spe]].append([])
            blocks[spe_ids[spe]][chm[chromo.upper()]].append([start, end, bloc_id, orient])
    return chm, bloc_id
Navigation

Source code for procars.utils.initial_files_reader

Quick search

Navigation