Last active
April 16, 2022 13:23
-
-
Save SebastianPoell/a06d38ba3d9126754d6197596bd8c3a3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script is an implementation of the Needleman-Wunsch algorithm for | |
# sequence alignment (mostly used for nucleotide or protein sequences in | |
# bioinformatics). The algorithm works by first computing a similarity matrix | |
# and then backtrack the optimal paths. The scores/penalties for matches, | |
# mismatches and gaps can be set. Complexity is O(n*m) where n and m are the | |
# lengths of both sequences. No dependencies have been used. | |
# | |
# https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm | |
# | |
class NeedlemanWunschAligner | |
def initialize(match: 1, mismatch: -1, gap: -1) | |
@match, @mismatch, @gap = match, mismatch, gap | |
end | |
def align(sequence_1, sequence_2, inspect: false) | |
# Build similarity matrix | |
matrix = build_matrix(sequence_1, sequence_2) | |
# Print similarity matrix and backtrack matrix if inspect is set | |
inspect(matrix) if inspect | |
# Perform backtracking and return an optimal alignment | |
backtrack(matrix, sequence_1, sequence_2) | |
end | |
private | |
def build_matrix(sequence_1, sequence_2) | |
# Init matrix with nils. In this implementation, each matrix entry is a | |
# hash, consisting of the similarity value and backtracking information. | |
# E.g. matrix[3][5] = { value: 3, backtrack: [:diagonal, :up] } | |
matrix = Array.new(sequence_1.size + 1) do | |
Array.new(sequence_2.size + 1) | |
end | |
# Fill matrix with similarity values and backtracking information. | |
# sequence_1 is rows (downward) and sequence_2 is columns (rightward). | |
matrix.size.times do |row| | |
matrix.first.size.times do |col| | |
# Compute value from the diagonal upper left | |
diagonal = if row.positive? && col.positive? | |
matrix[row - 1][col - 1][:value] + | |
(sequence_1[row - 1] == sequence_2[col - 1] ? @match : @mismatch) | |
end | |
# Compute value from the left | |
left = if col.positive? | |
matrix[row][col - 1][:value] + @gap | |
end | |
# Compute value from above | |
up = if row.positive? | |
matrix[row - 1][col][:value] + @gap | |
end | |
# Compute maximum of the three values above, this is the new value | |
max = [diagonal, up, left].compact.max || 0 | |
# Save similarity value and trackback the origin of the value | |
matrix[row][col] = { | |
value: max, | |
backtrack: [ | |
(:diagonal if max == diagonal), | |
(:left if max == left), | |
(:up if max == up) | |
].compact | |
} | |
end | |
end | |
# Return filled matrix | |
matrix | |
end | |
def inspect(matrix) | |
# Print similarity matrix. This is done by first computing the maximum | |
# width of all values and then pad every value with that max width. | |
puts "Similarity matrix:" | |
width = matrix.flatten.map { |v| v[:value] }.max.to_s.size + 2 | |
puts(matrix.map do |row| | |
row.map do |entry| | |
entry[:value].to_s.rjust(width) | |
end.join | |
end) | |
puts "\n" | |
# Print backtrack matrix. In order to be able to represent multiple | |
# backtrack pointers, we are using simple letters here. | |
puts "Backtrack matrix (l: left; u: up; d: diagonal):" | |
puts(matrix.map do |row| | |
row.map do |entry| | |
[ | |
("d" if entry[:backtrack].include?(:diagonal)), | |
("l" if entry[:backtrack].include?(:left)), | |
("u" if entry[:backtrack].include?(:up)) | |
].join.rjust(3) | |
end.join | |
end) | |
puts "\n" | |
end | |
def backtrack(matrix, sequence_1, sequence_2) | |
# Follow backtrack pointers and compute an optimal alignment. | |
# Please notice, that there often exist multiple optimal | |
# alignments. For the sake of simplicity, we only track back one | |
# such alginment. This could be extended in the future. | |
alignment_1, alignment_2 = "", "" | |
row, col = sequence_1.size, sequence_2.size | |
while row > 0 || col > 0 | |
case matrix[row][col][:backtrack] | |
# Backtrack diagonal | |
when -> (t) { t.include?(:diagonal) } | |
alignment_1.prepend(sequence_1[row - 1]) | |
alignment_2.prepend(sequence_2[col - 1]) | |
row -= 1; col -= 1 | |
# Backtrack left | |
when -> (t) { t.include?(:left) } | |
alignment_1.prepend("-") | |
alignment_2.prepend(sequence_2[col - 1]) | |
col -= 1 | |
# Backtrack up | |
when -> (t) { t.include?(:up) } | |
alignment_1.prepend(sequence_1[row - 1]) | |
alignment_2.prepend("-") | |
row -= 1 | |
end | |
end | |
# Return final optimal alignment | |
[alignment_1, alignment_2] | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Use it like that: