Source code for procars.utils.initial_files_reader

#!/usr/bin/env python
# -*- coding: utf-8 -*-:

"""
Copyright © Bonsai - LIFL (Université Lille 1, CNRS UMR 8022) and Inria-Lille Nord Europe

contact: aida.ouangraoua@inria.fr, amandine.perrin@inria.fr

This software is a computer program whose purpose is to progressively reconstruct ancestral
gene orders.

This software is governed by the CeCILL-B license under French law and
abiding by the rules of distribution of free software. You can use,
modify and/or redistribute the software under the terms of the CeCILL-B
license as circulated by CEA, CNRS and Inria at the following URL
http://www.cecill.info, or in the LICENCE file at the root directory of this program.

The fact that you are presently reading this means that you have had
knowledge of the CeCILL-B license and that you accept its terms.

---------------------------------------------------

``initial_files_reader`` **module description**:

This module contains some methods used to read and parse the initial files:

    * Phylogeny tree file (where a node is marked as the ancestor: @)
    * Blocks file (with block order of each genome of the tree)


.. moduleauthor:: Aïda Ouangraoua, Amandine PERRIN

May 2014

"""


import re


[docs]def read_treefile(tree_file_name): """ Function reading the tree structure from a file Parameters ---------- tree_file_name : String Name of file containing species tree Returns ------- tuple *tree:* dict, tree structure: | {node_id: [node_name, parent_id, left_child_id, right_child_id], ...} | where *id* = -1 if no *parent/left_child/right_child* | and *node_name* = "N" + *node_id* for internal nodes, or *leaf_name* for leaf nodes *ancestor_id:* int, id of ancestor node in tree """ ancestor_id = -1 # integer, identifier of ancestor node to be reconstructed tree = {} # a map for the tree structure (see docstring hereabove) with open(tree_file_name, "r") as treefile: treeseq = treefile.readline() cur_id = 0 # ids to be associated to nodes pile = [] # pile used to remember ancestor nodes pos = 0 # integer containing successive positions in the string treeseq while pos < len(treeseq): # start to read a new internal node if treeseq[pos] == "(": cur_id += 1 tree[cur_id] = ["N" + str(cur_id), -1, -1, -1] # if node has a parent if len(pile) > 0: # Add parent ID in the current node array (index 1) tree[cur_id][1] = pile[-1] # Add current node as a child to his parent node. if(tree[pile[-1]][2] == -1): tree[pile[-1]][2] = cur_id else: tree[pile[-1]][3] = cur_id # add node on top of the pile pile.append(cur_id) pos += 1 # end of node elif treeseq[pos] == ")": pos += 1 # if node is the ancestor node to be reconstructed, save its ID if treeseq[pos] == "@": ancestor_id = pile[-1] pos += 1 # check if end of line is reached if treeseq[pos] == ";" or treeseq[pos] == "\n" or treeseq == " ": pos += 2 del pile[-1] # new sibling elif treeseq[pos] == ",": pos += 1 # leaf node else: # Give node a new ID cur_id += 1 tree[cur_id] = ["", -1, -1, -1] # if node as a parent if len(pile) > 0: # add parent at position 1 in array tree[cur_id][1] = pile[-1] # add current node id in children position of parent array if(tree[pile[-1]][2] == -1): tree[pile[-1]][2] = cur_id else: tree[pile[-1]][3] = cur_id # add leaf species name at position 0 in the array begin = pos while(pos < len(treeseq) and treeseq[pos] != "," and treeseq[pos] != ")"): pos += 1 nom = treeseq[begin: pos] nom = "_".join(nom.split()) # Remove spaces in leaf name if there are tree[cur_id][0] = nom return tree, ancestor_id
[docs]def read_block_file(block_file_name, nb_species, spe_ids): """ Method reading blocks from a block file Parameters ---------- block_file_name : String Name of file containing the ordered blocks for all species nb_species : int Total number of species spe_ids : dict Species and their corresponding ID {spe1: id1, spe2: id2, ...} Returns ------- list *blocks:* array of arrays (blocks) | blocks = [[*list of blocks for spe_id=0*], [*list of blocks for spe_id=1*], ...] | with *list of blocks for spe_id=1* = [[*list of blocks for spe_id=1, chromosome 1*], [*list of blocks for spe_id=1, chromosome 2*], ... ] | with *list of blocks for spe_id=1, chromosome 1* = [[start, end, num_bloc, orient], [start, end, num_bloc, orient], ... for the n blocks ] *nb_blocks:* int, total number of blocks """ blocks = [[] for _ in xrange(nb_species)] # array containing arrays (one for each species) with open(block_file_name, "r") as blocfile: chm = {} # map : chr_name -> integer_id chm['X'] = 0 bloc_id = 0 # Current bloc id for line in blocfile: chm, bloc_id = read_block_line(line, chm, spe_ids, blocks, bloc_id) nb_blocks = bloc_id return blocks, nb_blocks
[docs]def read_block_line(line, chm, spe_ids, blocks, bloc_id): """ Reading one line of a block file, in order to return a new block number, or the positions of a block in a species chromosome. Parameters ---------- line : String A line of the block file chm : dict Chromosome IDs : *{chromo: id, chromo2: id2, ...}* spe_ids : dict Species and their corresponding ID *{spe1: id1, spe2: id2, ...}* blocks : list Array of blocks already read, to complete bloc_id : int Current bloc id Returns ------- list *chm:* dict, chromosome IDs : *{chromo: id, chromo2: id2, ...}* *bloc_id:* int, current bloc id """ # New block if line[0] == '>': # find line index of '>' (if not first caracter) end = [m.end() for m in re.finditer('>', line)] bloc_id = int(line[end[0]:].strip()) # Get block positions for one species elif len(line.strip()) > 0: spe = line.strip().split(".")[0] # if the species exists in our list of species ids, add to block information if spe in spe_ids: chromo = line.strip().split(".")[-1].split(":")[0] start = int(line.split(":")[-1].split("-")[0].strip()) end = int(line.split("-")[1].split()[0].strip()) orient = line.strip().split()[-1] orient = 1 if orient == "+" else -1 if not chromo.upper() in chm: chm[chromo.upper()] = max(chm.values()) + 1 if (chm[chromo.upper()] >= len(blocks[spe_ids[spe]])): nb_chr_add = chm[chromo.upper()] - len(blocks[spe_ids[spe]]) + 1 for _ in xrange(nb_chr_add): blocks[spe_ids[spe]].append([]) blocks[spe_ids[spe]][chm[chromo.upper()]].append([start, end, bloc_id, orient]) return chm, bloc_id