Source code for procars.utils.utils_IO

#!/usr/bin/env python
# -*- coding: utf-8 -*-:

"""
Copyright © Bonsai - LIFL (Université Lille 1, CNRS UMR 8022) and Inria-Lille Nord Europe

contact: aida.ouangraoua@inria.fr, amandine.perrin@inria.fr

This software is a computer program whose purpose is to progressively reconstruct ancestral
gene orders.

This software is governed by the CeCILL-B license under French law and
abiding by the rules of distribution of free software. You can use,
modify and/or redistribute the software under the terms of the CeCILL-B
license as circulated by CEA, CNRS and Inria at the following URL
http://www.cecill.info, or in the LICENCE file at the root directory of this program.

The fact that you are presently reading this means that you have had
knowledge of the CeCILL-B license and that you accept its terms.

---------------------------------------------------

``utils_IO`` **module description**:

This module contains helper functions used by the other modules whenever they need to read from or write to files.

.. moduleauthor:: Aïda Ouangraoua, Amandine PERRIN

May 2014

"""

try:
    import cPickle as pickle
except ImportError:
    import pickle
from procars.utils import util_adjacency_functions


def save_binary_information(bin_filename, information):
    """
    Serialize some information with pickle and store it in a binary file

    Parameters
    ----------
    bin_filename : String
        Name of the binary file to write. It is opened in ``"wb"`` mode, so
        any previous content is replaced.
    information : list
        Information to save into the binary file (any picklable object is
        accepted by the pickler)

    """
    with open(bin_filename, "wb") as pickle_output:
        # Default pickle protocol, exactly as a bare Pickler would use.
        pickle.dump(information, pickle_output)
def read_binary_file(bin_filename):
    """
    Load back the information that was pickled into a binary file

    Parameters
    ----------
    bin_filename : string
        Name of the binary file to read (opened in ``"rb"`` mode)

    Returns
    -------
    object
        The first object stored in the file, as rebuilt by the unpickler

    .. warning:: Unpickling can execute arbitrary code. Only read files that
                 come from a trusted source.

    """
    with open(bin_filename, "rb") as pickle_input:
        return pickle.load(pickle_input)
def read_adjacency_file(adjacency_file_name, nb_blocks, discarded=False):
    """
    Read an adjacency file (or a discarded adjacency file), and find all blocks and
    their left and right neighbors used in ``compute_pqtree`` and ``resolve_conflict``

    Each non-empty line must start with two signed block numbers; any further
    column is ignored. Empty lines are skipped.

    Parameters
    ----------
    adjacency_file_name : string
        Name of the file containing the adjacencies to parse
    nb_blocks : int
        Total number of blocks. Blocks are numbered from 1 to ``nb_blocks``.
    discarded : boolean
        True if we are reading a file of discarded adjacencies (and hence there
        can be multiple left/right neighbors), False for a file of added
        adjacencies (one left/right neighbor per block end)

    Returns
    -------
    tuple
        *left:* dict with:

        - if not discarded: for each block number, its left neighbor:
          left[bloc2] = bloc1, or 0 when the block has no left neighbor.
          If several neighbors were read, only the first one is kept.
        - if discarded: for each block number, a list containing its potential
          left neighbors: left[bloc1] = [bloc2, -bloc3,..]

        *right:* dict with:

        - if not discarded: for each block number, its right neighbor:
          right[bloc1] = bloc2, or 0 when the block has no right neighbor.
          If several neighbors were read, only the first one is kept.
        - if discarded: for each block number, a list containing its potential
          right neighbors: right[bloc1] = [-bloc2, bloc3,..]

    Raises
    ------
    KeyError
        If a block number read in the file is 0 or greater than ``nb_blocks``
    ValueError
        If one of the two first columns of a line is not an integer

    """
    left = {block_id: [] for block_id in range(1, nb_blocks + 1)}
    right = {block_id: [] for block_id in range(1, nb_blocks + 1)}

    with open(adjacency_file_name, "r") as adjacency_lines:
        for line in adjacency_lines:
            adjacency = line.split()
            if not adjacency:
                # Blank line (typically a trailing newline): nothing to parse.
                continue
            signed_block1 = int(adjacency[0])
            signed_block2 = int(adjacency[1])

            # Adjacency "b1 b2": b2 follows b1. A negative sign means the block
            # is read reversed, so the neighbor lands on the other side with
            # the opposite sign.
            if signed_block1 > 0:
                right[signed_block1].append(signed_block2)
            else:
                left[-signed_block1].append(-signed_block2)

            if signed_block2 > 0:
                left[signed_block2].append(signed_block1)
            else:
                right[-signed_block2].append(-signed_block1)

    if not discarded:
        # Added adjacencies: one neighbor per block end, 0 meaning "none".
        # items() (instead of iteritems()) works on both Python 2 and 3.
        left = {key: (value[0] if value else 0) for key, value in left.items()}
        right = {key: (value[0] if value else 0) for key, value in right.items()}
    return left, right
def read_car_file(car_file_name, nb_blocks):
    """
    Function reading a CAR file

    Lines whose first non-blank character is ``#`` are ignored, as are empty
    lines. Every other line describes one CAR: the signed block numbers
    written between the ``_Q`` and ``Q_`` markers. CARs are numbered from 1,
    in the order in which they appear in the file.

    Parameters
    ----------
    car_file_name : string
        Name of the file containing the current PQtree
    nb_blocks : int
        Total number of blocks

    Returns
    -------
    tuple
        *cars:* list of lists (cars). ``cars[0]`` is an empty placeholder so
        that ``cars[car_id]`` is the list of signed blocks of CAR ``car_id``.

        *block_to_car:* integer list of size ``nb_blocks + 1`` such that
        block_to_car[block_id] = car_id to which block_id belongs
        (0 if the block was not found in any CAR)

        *block_position_in_car:* integer list of size ``nb_blocks + 1`` such
        that block_position_in_car[block_id] = 0-based position of block in
        the car to which it belongs (0 as well if the block was not found)

    Raises
    ------
    IndexError
        If a CAR line has no ``_Q`` marker, or contains a block number
        greater than ``nb_blocks``
    ValueError
        If a block between the markers is not an integer

    """
    cars = [[]]  # cars ordered by increasing car_id, index 0 is unused
    # block_id -> car_id to which block belongs
    block_to_car = [0] * (nb_blocks + 1)
    # block_id -> position of block in its car
    block_position_in_car = [0] * (nb_blocks + 1)

    with open(car_file_name, "r") as carlines:
        current_car = 0
        for line in carlines:
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                # Blank line or "#CARn" header: no blocks to read.
                continue
            current_car += 1
            car_string = line.split("_Q")[1].split("Q_")[0].split()
            car_int = [int(block) for block in car_string]  # signed integers
            for num, signed_block in enumerate(car_int):
                block_to_car[abs(signed_block)] = current_car
                block_position_in_car[abs(signed_block)] = num
            cars.append(car_int)
    return cars, block_to_car, block_position_in_car
def read_conflict_adj_file(adj_file, nb_species, leaves, tree, spe_ids):
    """
    Read file containing previously discarded adjacencies, and yield them with
    their information

    Each non-empty line is expected to hold, separated by whitespace: the two
    signed blocks of the adjacency, then the car adjacency, the type of
    adjacency and the step at which it was found, and finally one label
    (0 = absent, 1 = present) per species. Empty lines are skipped.

    Parameters
    ----------
    adj_file : string
        File in which discarded adjacencies are written
    nb_species : int
        Total number of species (= number of label columns ending each line)
    leaves : list
        List of IDs of tree leaves (= genomes)
    tree : dict
        A tree structure. Only ``tree[leaf][0]`` is read here, and it is used
        as the name of the species of that leaf.
    spe_ids : dict
        Species as keys, and their corresponding ID as value. The ID is the
        index of the species among the label columns.

    Returns
    -------
    tuple
        tuple yield for each adjacency = each non-empty line of the file:

        *labels:* dict with tree leaves as keys, and as value a one-element
        list holding an int specifying if the adjacency is present (2) or
        absent (1) in the corresponding species.

        *adj_id:* int, ID of current adjacency (0-based index of its line in
        the file, empty lines included in the count)

        *adjacency:* tuple, current adjacency (num_bloc1, num_bloc2)

        *step_car_adj:* list of strings, every column after the two blocks:
        current car adjacency, type of adjacency, step at which it was found,
        followed by the species labels
        (num_car1, num_car2, type, step, label1, label2, ...)

    .. warning:: These Python objects are yield and not returned

    """
    # leaf -> column index of its species among the labels. It does not depend
    # on the line being read, so it is computed once, before reading the file.
    # Dict comprehension only: no iteritems(), hence Python 2 and 3 compatible.
    leave_ids = {leaf_num: spe_ids[tree[leaf_num][0]] for leaf_num in leaves}

    with open(adj_file, "r") as adjacency_lines:
        for adj_id, line in enumerate(adjacency_lines):
            words = line.split()
            if not words:
                # Blank line: nothing to yield, but adj_id keeps following
                # the line index.
                continue
            adjacency = (int(words[0]), int(words[1]))  # pair of signed blocks
            # car adjacency, type of adj, step at which this adjacency was
            # added, and labels:
            step_car_adj = words[2:]
            # nb_species last words are the labels (present=1/absent=0)
            labels_list = words[-nb_species:]
            # labels = {leaf_num: [label]} shifted to 1=absent / 2=present
            labels = {leaf_num: [int(labels_list[leaf_id]) + 1]
                      for leaf_num, leaf_id in leave_ids.items()}
            yield labels, adj_id, adjacency, step_car_adj
def write_retained_conflict_adjs(filename, adj_infos, maximum_set, adj_ids):
    """
    Write the adjacencies retained after a conflict resolution into a txt
    file -> next adjacency file

    One line is written per retained adjacency, with space-separated fields:
    the adjacency itself, the three first items of its information, the step
    number (as an int), then the labels (as ints).

    Parameters
    ----------
    filename : string
        Name of the file in which retained adjacencies are stored (it is
        overwritten if it already exists)
    adj_infos : dict
        Dictionary with adjacency IDs as keys, and values are a list with the
        car adjacency (car1, car2), the type of adjacency, the step at which
        it was found and the presence of this adjacency in each species
    maximum_set : list
        List of retained adjacency ids, written in that order
    adj_ids : dict
        Keys are adjacency IDs, and values are the adjacency corresponding to
        the ID

    """
    with open(filename, "w") as retained_file:
        for kept_id in maximum_set:
            # block adjacency first ...
            fields = list(adj_ids[kept_id])
            info = adj_infos[kept_id]
            # ... then car adjacency and type of adjacency, kept as they are
            fields.extend(info[:3])
            # ... then the step number
            fields.append(int(info[3]))
            # ... and finally the labels
            fields.extend(int(label) for label in info[4:])
            retained_file.write(" ".join(map(str, fields)) + "\n")
def write_adjacency(output_file, cars, car_adjacency, adj_type, step_nb, labels):
    """
    Writes all information about one adjacency, as a single line, into a file
    handler

    The line written is
    ``block1 block2 car1 car2 adj_type step_nb`` immediately followed by
    ``labels`` and a newline.

    Parameters
    ----------
    output_file : FileHandler
        Open file or StringIO in which the status of the adjacency is written
    cars : list
        List of current cars
    car_adjacency : list
        The two members of the adjacency. Each one is converted into a signed
        block by ``util_adjacency_functions.car_to_block`` (called with 1 for
        the first member and 0 for the second one).
    adj_type : int
        Type of adjacency : 0 if fully, 1 if partly
    step_nb : int
        Current step of the ProCars Method
    labels : string
        More information (presence/absence of the adjacency in each species).
        It is appended right after the step number without any separator, so
        it has to start with its own whitespace.

    """
    first_car, second_car = car_adjacency[0], car_adjacency[1]
    first_block = util_adjacency_functions.car_to_block(first_car, 1, cars)
    second_block = util_adjacency_functions.car_to_block(second_car, 0, cars)
    columns = (first_block, second_block, first_car, second_car,
               adj_type, step_nb)
    output_file.write(" ".join(str(column) for column in columns)
                      + labels + "\n")
def write_car_file(car_filename, all_cars):
    """
    Writes all CARs into the given file

    Two lines are written per CAR: a ``#CARn`` header, where n starts at 1
    for the first element of ``all_cars``, then the signed blocks of the CAR
    between the ``_Q`` and ``Q_`` markers.

    Parameters
    ----------
    car_filename : String
        Name of the file in which all CARs are stored (= PQtree file). It is
        overwritten if it already exists.
    all_cars : list
        List of lists, each inner list being all the ordered signed blocks
        [bloc1, bloc2, ...] of one CAR

    """
    with open(car_filename, "w") as carfile:
        for car_number, car in enumerate(all_cars, 1):
            blocks = " ".join(map(str, car))
            carfile.write("#CAR{0}\n".format(car_number))
            carfile.write("_Q {0} Q_\n".format(blocks))