1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
|
#!/usr/bin/env python
#
# Copyright 2011 Free Software Foundation, Inc.
#
# This file is part of GNU Radio
#
# GNU Radio is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
#
# GNU Radio is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with GNU Radio; see the file COPYING. If not, write to
# the Free Software Foundation, Inc., 51 Franklin Street,
# Boston, MA 02110-1301, USA.
#
"""
This module contains functions for aligning sequences.
>>> import random
>>> rndm = random.Random()
>>> rndm.seed(1234)
>>> ran_seq = [rndm.randint(0,1) for i in range(0, 100)]
>>> offset_seq = [0] * 20 + ran_seq
>>> correct, overlap, offset = align_sequences(ran_seq, offset_seq)
>>> print(correct, overlap, offset)
(1.0, 100, -20)
>>> offset_err_seq = []
>>> for bit in offset_seq:
... if rndm.randint(0,4) == 4:
... offset_err_seq.append(rndm.randint(0,1))
... else:
... offset_err_seq.append(bit)
>>> correct, overlap, offset = align_sequences(ran_seq, offset_err_seq)
>>> print(overlap, offset)
(100, -20)
"""
import random
# DEFAULT PARAMETERS
# If the fraction of matching bits between two sequences is greater than
# this the sequences are assumed to be aligned.
def_correct_cutoff = 0.9
# The maximum offset to test during sequence alignment.
def_max_offset = 500
# The maximum number of samples to take from two sequences to check alignment.
def_num_samples = 1000
def compare_sequences(d1, d2, offset, sample_indices=None):
"""
Takes two binary sequences and an offset and returns the number of
matching entries and the number of compared entries.
d1 & d2 -- sequences
offset -- offset of d2 relative to d1
sample_indices -- a list of indices to use for the comparison
"""
max_index = min(len(d1), len(d2)+offset)
if sample_indices is None:
sample_indices = range(0, max_index)
correct = 0
total = 0
for i in sample_indices:
if i >= max_index:
break
if d1[i] == d2[i-offset]:
correct += 1
total += 1
return (correct, total)
def random_sample(size, num_samples=def_num_samples, seed=None):
"""
Returns a set of random integers between 0 and (size-1).
The set contains no more than num_samples integers.
"""
rndm = random.Random()
rndm.seed(seed)
if num_samples > size:
indices = set(range(0, size))
else:
if num_samples > size/2:
num_samples = num_samples/2
indices = set([])
while len(indices) < num_samples:
index = rndm.randint(0, size-1)
indices.add(index)
indices = list(indices)
indices.sort()
return indices
def align_sequences(d1, d2,
num_samples=def_num_samples,
max_offset=def_max_offset,
correct_cutoff=def_correct_cutoff,
seed=None,
indices=None):
"""
Takes two sequences and finds the offset and which the two sequences best
match. It returns the fraction correct, the number of entries compared,
the offset.
d1 & d2 -- sequences to compare
num_samples -- the maximum number of entries to compare
max_offset -- the maximum offset between the sequences that is checked
correct_cutoff -- If the fraction of bits correct is greater than this then
the offset is assumed to optimum.
seed -- a random number seed
indices -- an explicit list of the indices used to compare the two sequences
"""
max_overlap = max(len(d1), len(d2))
if indices is None:
indices = random_sample(max_overlap, num_samples, seed)
max_frac_correct = 0
best_offset = None
best_compared = None
best_correct = None
pos_range = range(0, min(len(d1), max_offset))
neg_range = range(-1, -min(len(d2), max_offset), -1)
# Interleave the positive and negative offsets.
int_range = [item for items in zip(pos_range, neg_range) for item in items]
for offset in int_range:
correct, compared = compare_sequences(d1, d2, offset, indices)
frac_correct = 1.0*correct/compared
if frac_correct > max_frac_correct:
max_frac_correct = frac_correct
best_offset = offset
best_compared = compared
best_correct = correct
if frac_correct > correct_cutoff:
break
return max_frac_correct, best_compared, best_offset, indices
if __name__ == "__main__":
import doctest
doctest.testmod()
|