#!/usr/bin/env python
# -*- coding: utf-8 -*-:
"""
Copyright © Bonsai - LIFL (Université Lille 1, CNRS UMR 8022) and Inria-Lille Nord Europe
contact: aida.ouangraoua@inria.fr, amandine.perrin@inria.fr
This software is a computer program whose purpose is to progressively reconstruct ancestral
gene orders.
This software is governed by the CeCILL-B license under French law and
abiding by the rules of distribution of free software. You can use,
modify and/or redistribute the software under the terms of the CeCILL-B
license as circulated by CEA, CNRS and Inria at the following URL
http://www.cecill.info, or in the LICENCE file at the root directory of this program.
The fact that you are presently reading this means that you have had
knowledge of the CeCILL-B license and that you accept its terms.
---------------------------------------------------
``utils_IO`` **module description**:
This module contains helper functions used by the other modules to read from and write to files.
.. moduleauthor:: Aïda Ouangraoua, Amandine PERRIN
May 2014
"""
try:
import cPickle as pickle
except ImportError:
import pickle
from procars.utils import util_adjacency_functions
def read_binary_file(bin_filename):
    """ Read back the Python object pickled into a binary file

    Parameters
    ----------
    bin_filename : string
        Name of the binary (pickle) file to read

    Returns
    -------
    object
        The first object stored in the file, exactly as it was pickled (its type is
        whatever the writer dumped)

    .. warning:: ``pickle`` can execute arbitrary code while loading. Only call this
        function on files coming from a trusted source.
    """
    with open(bin_filename, "rb") as binfile:
        # SECURITY: unpickling runs code embedded in the file; never feed it untrusted data
        return pickle.load(binfile)
def read_adjacency_file(adjacency_file_name, nb_blocks, discarded=False):
    """ Read an adjacency file (or a discarded adjacency file), and find all blocks
    and their left and right neighbors

    used in ``compute_pqtree`` and ``resolve_conflict``

    Each non-blank line must start with two signed block numbers ``b1 b2`` (anything after
    them is ignored). The adjacency is recorded from both sides:

    - ``b1 > 0``: ``b2`` is a right neighbor of ``b1``; otherwise ``-b2`` is a left
      neighbor of ``|b1|``
    - ``b2 > 0``: ``b1`` is a left neighbor of ``b2``; otherwise ``-b1`` is a right
      neighbor of ``|b2|``

    Blank lines are skipped.

    Parameters
    ----------
    adjacency_file_name : string
        Name of the file containing the adjacencies to parse
    nb_blocks : int
        Total number of blocks (blocks are numbered from 1 to nb_blocks)
    discarded : boolean
        True if we are reading a file of discarded adjacencies
        (and hence there can be multiple left/right neighbors), False for a file of
        added adjacencies (one left/right neighbor per block_end)

    Returns
    -------
    tuple
        *left:* dict with:
            - if not discarded: for each block number, its left neighbor (the first one
              found in the file), or 0 if it has none: left[bloc2] = bloc1
            - if discarded: for each block number, an array containing its potential
              left neighbors: left[bloc1] = [bloc2, -bloc3,..]
        *right:* dict with:
            - if not discarded: for each block number, its right neighbor (the first one
              found in the file), or 0 if it has none: right[bloc1] = bloc2
            - if discarded: for each block number, an array containing its potential
              right neighbors: right[bloc1] = [-bloc2, bloc3,..]

    Raises
    ------
    KeyError
        If a block number read in the file is not in 1..nb_blocks
    ValueError
        If one of the two first fields of a line is not an integer
    IndexError
        If a non-blank line contains a single field
    """
    block_ids = range(1, nb_blocks + 1)
    left = {block_id: [] for block_id in block_ids}
    right = {block_id: [] for block_id in block_ids}
    with open(adjacency_file_name, "r") as adjacency_lines:
        for line in adjacency_lines:
            adjacency = line.split()
            if not adjacency:
                # blank line (typically a trailing newline): nothing to parse
                continue
            signed_block1 = int(adjacency[0])
            signed_block2 = int(adjacency[1])
            if signed_block1 > 0:
                right[signed_block1].append(signed_block2)
            else:
                left[-signed_block1].append(-signed_block2)
            if signed_block2 > 0:
                left[signed_block2].append(signed_block1)
            else:
                right[-signed_block2].append(-signed_block1)
    if not discarded:
        # added adjacencies: keep a single neighbor per side, 0 meaning "no neighbor".
        # .items() (instead of .iteritems()) works with both Python 2 and Python 3.
        left = {key: (value[0] if value else 0) for key, value in left.items()}
        right = {key: (value[0] if value else 0) for key, value in right.items()}
    return left, right
def read_car_file(car_file_name, nb_blocks):
    """ Function reading a CAR file

    Lines whose first non-blank character is ``#`` (the ``#CARn`` headers) and blank lines
    are skipped. Every other line must have the form ``_Q b1 b2 ... Q_`` where the ``bi``
    are signed block numbers; CARs are numbered from 1, in their order of appearance.

    Parameters
    ----------
    car_file_name : string
        Name of the file containing the current PQtree
    nb_blocks : int
        Total number of blocks

    Returns
    -------
    tuple
        *cars:* array of arrays (cars), such that cars[car_id] = signed blocks of this
        car. cars[0] is an empty placeholder, as car IDs start at 1

        *block_to_car:* integer array such that block_to_car[block_id] = car_id to which
        block_id belongs (0 if the block is in no car)

        *block_position_in_car:* integer array such that block_position_in_car[block_id] =
        position (0-based) of block in car to which it belongs (also 0 if the block
        is in no car)

    Raises
    ------
    IndexError
        If a CAR line does not contain ``_Q``, or if a block number is greater
        than nb_blocks
    ValueError
        If a field between ``_Q`` and ``Q_`` is not an integer
    """
    cars = [[]]  # array of cars ordered by increasing car_id (index 0 is unused)
    # array : block_id -> car_id to which block belongs
    block_to_car = [0] * (nb_blocks + 1)
    # array : block_id -> position of block in its car
    block_position_in_car = [0] * (nb_blocks + 1)
    with open(car_file_name, "r") as carlines:
        current_car = 0
        for line in carlines:
            stripped = line.strip()
            if not stripped or stripped.startswith('#'):
                # blank line or "#CARn" header: no block to read
                continue
            current_car += 1
            # keep only what lies between the "_Q" and "Q_" delimiters
            car_string = line.split("_Q")[1].split("Q_")[0].split()
            car_int = [int(block) for block in car_string]  # array of signed integers
            for position, signed_block in enumerate(car_int):
                block_to_car[abs(signed_block)] = current_car
                block_position_in_car[abs(signed_block)] = position
            cars.append(car_int)
    return cars, block_to_car, block_position_in_car
def read_conflict_adj_file(adj_file, nb_species, leaves, tree, spe_ids):
    """ Read file containing previously discarded adjacencies, and yield them with their information

    Each line is split on whitespace: the two first fields are the signed blocks of the
    adjacency, and the ``nb_species`` last fields are the presence labels of the adjacency
    in each species (1 = present, 0 = absent), indexed by species ID.

    Parameters
    ----------
    adj_file : string
        File in which discarded adjacencies are written
    nb_species : int
        Total number of species
    leaves : list
        List of IDs of tree leaves (= genomes)
    tree : dict
        A tree structure: for a leaf, ``tree[leaf_id][0]`` is the name of its species
    spe_ids : dict
        Species as keys, and their corresponding ID as value. The ID is used as the
        index of the species among the labels of a line

    Returns
    -------
    tuple
        tuple yield for each adjacency = each line of the file:

        *labels:* dict with leaf IDs (elements of ``leaves``) as keys, and as value a
        one-element list holding an int specifying if the adjacency is present (2)
        or absent (1) in the corresponding species.

        *adj_id:* int, ID of current adjacency (0-based line number in the file)

        *adjacency:* tuple, current adjacency (num_bloc1, num_bloc2)

        *step_car_adj:* list of strings, all fields following the two blocks: current car
        adjacency, type of adjacency, step at which it was found
        (num_car1, num_car2, type, step), followed by the labels

    .. warning:: These Python objects are yield and not returned
    """
    with open(adj_file, "r") as adjacency_lines:
        # The leaf -> label index mapping does not depend on the line being read, so it
        # is built once here instead of once per adjacency.
        # leaf_columns = {leaf_num: ID of the leaf's species = index of its label}
        leaf_columns = {leaf_num: spe_ids[tree[leaf_num][0]] for leaf_num in leaves}
        for adj_id, line in enumerate(adjacency_lines):
            words = line.split()
            adjacency = (int(words[0]), int(words[1]))  # pair of signed blocks
            # car adjacency, type of adj and step at which this adjacency was added
            # (the trailing labels are kept in this list too):
            step_car_adj = words[2:]
            # list of labels (given adjacency present=1/absent=0) for each species
            labels_list = words[-nb_species:]  # nb_species last words are the labels
            # labels = {leaf_num: [label]} with the label shifted to 1=absent/2=present.
            # .items() (instead of .iteritems()) works with both Python 2 and Python 3.
            labels = {leaf_num: [int(labels_list[column]) + 1]
                      for leaf_num, column in leaf_columns.items()}
            yield labels, adj_id, adjacency, step_car_adj
def write_retained_conflict_adjs(filename, adj_infos, maximum_set, adj_ids):
    """ Write the adjacencies retained after a conflict resolution into a txt file
    -> next adjacency file

    One line is written per retained adjacency, with space-separated fields: the
    adjacency itself, the three first elements of its information, the step number and
    the labels (both converted to int).

    Parameters
    ----------
    filename : string
        Name of the file in which retained adjacencies are stored (overwritten if it
        already exists)
    adj_infos : dict
        Dictionary with adjacency IDs as keys, and values are a list with the car adjacency
        (car1, car2), the type of adjacency, the step at which it was found and the presence
        of this adjacency in each species
    maximum_set : list
        List of retained adjacency ids, written in this order
    adj_ids : dict
        Keys are adjacency IDs, and values are the adjacency corresponding to the ID
    """
    with open(filename, "w") as retained_file:
        for kept_id in maximum_set:
            # block adjacency first...
            fields = list(adj_ids[kept_id])
            infos = adj_infos[kept_id]
            # ...then car adjacency and type of adjacency, unchanged
            fields.extend(infos[:3])
            # step number
            fields.append(int(infos[3]))
            # labels
            fields.extend(int(label) for label in infos[4:])
            retained_file.write(" ".join(str(field) for field in fields) + "\n")
def write_adjacency(output_file, cars, car_adjacency, adj_type, step_nb, labels):
    """ Write one adjacency and all its information, as a single line, into a file handler

    The line contains, separated by spaces: the two signed blocks computed from the car
    adjacency, the two elements of the car adjacency, the adjacency type and the step
    number. ``labels`` is then appended as is, followed by a newline.

    Parameters
    ----------
    output_file : FileHandler
        Open file or StringIO in which the adjacency is written
    cars : list
        List of current cars
    car_adjacency : list
        The two elements of the adjacency. Each one is converted into a signed block with
        ``util_adjacency_functions.car_to_block`` (called with 1 for the first element
        and 0 for the second one; presumably this selects the CAR extremity -- to be
        confirmed in ``util_adjacency_functions``)
    adj_type : int
        Type of adjacency : 0 if fully, 1 if partly
    step_nb : int
        Current step of the ProCars Method
    labels : string
        More information (presence/absence of the adjacency in each species). No separator
        is inserted before it, so it presumably starts with a space -- to be confirmed
        with the callers
    """
    first_block = util_adjacency_functions.car_to_block(car_adjacency[0], 1, cars)
    second_block = util_adjacency_functions.car_to_block(car_adjacency[1], 0, cars)
    fields = [first_block, second_block, car_adjacency[0], car_adjacency[1],
              adj_type, step_nb]
    output_file.write(" ".join(str(field) for field in fields) + labels + "\n")
def write_car_file(car_filename, all_cars):
    """ Write all CARs into the given file

    Two lines are written per CAR: a ``#CARn`` header (n starting at 1, following the
    order of ``all_cars``), then the signed blocks of the CAR between ``_Q`` and ``Q_``.

    Parameters
    ----------
    car_filename : String
        Name of the file in which all CARs are stored (= PQtree file); overwritten if it
        already exists
    all_cars : list
        List of lists, such that all_cars[car_num] = [bloc1, bloc2, ...] = all ordered signed
        blocks of car number *car_num*
    """
    with open(car_filename, "w") as car_handle:
        for car_number, blocks in enumerate(all_cars, 1):
            blocks_line = "_Q " + " ".join(map(str, blocks)) + " Q_\n"
            header_line = "#CAR" + str(car_number) + "\n"
            car_handle.write(header_line)
            car_handle.write(blocks_line)