SebastianPoell/needleman_wunsch.rb

## needleman_wunsch.rb
# This script is an implementation of the Needleman-Wunsch algorithm for
# sequence alignment (mostly used for nucleotide or protein sequences in
# bioinformatics). The algorithm works by first computing a similarity matrix
# and then backtrack the optimal paths. The scores/penalties for matches,
# mismatches and gaps can be set. Complexity is O(n*m) where n and m are the
# lengths of both sequences. No dependencies have been used.
#
# https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
#
class NeedlemanWunschAligner

  def initialize(match: 1, mismatch: -1, gap: -1)
    @match, @mismatch, @gap = match, mismatch, gap
  end

  def align(sequence_1, sequence_2, inspect: false)

    # Build similarity matrix
    matrix = build_matrix(sequence_1, sequence_2)

    # Print similarity matrix and backtrack matrix if inspect is set
    inspect(matrix) if inspect

    # Perform backtracking and return an optimal alignment
    backtrack(matrix, sequence_1, sequence_2)
  end

  private

    def build_matrix(sequence_1, sequence_2)

      # Init matrix with nils. In this implementation, each matrix entry is a
      # hash, consisting of the similarity value and backtracking information.
      # E.g. matrix[3][5] = { value: 3, backtrack: [:diagonal, :up] }
      matrix = Array.new(sequence_1.size + 1) do
        Array.new(sequence_2.size + 1)
      end

      # Fill matrix with similarity values and backtracking information.
      # sequence_1 is rows (downward) and sequence_2 is columns (rightward).
      matrix.size.times do |row|
        matrix.first.size.times do |col|

          # Compute value from the diagonal upper left
          diagonal = if row.positive? && col.positive?
            matrix[row - 1][col - 1][:value] +
            (sequence_1[row - 1] == sequence_2[col - 1] ? @match : @mismatch)
          end

          # Compute value from the left
          left = if col.positive?
            matrix[row][col - 1][:value] + @gap
          end

          # Compute value from above
          up = if row.positive?
            matrix[row - 1][col][:value] + @gap
          end

          # Compute maximum of the three values above, this is the new value
          max = [diagonal, up, left].compact.max || 0

          # Save similarity value and trackback the origin of the value
          matrix[row][col] = {
            value: max,
            backtrack: [
              (:diagonal if max == diagonal),
              (:left if max == left),
              (:up if max == up)
            ].compact
          }
        end
      end

      # Return filled matrix
      matrix
    end

    def inspect(matrix)

      # Print similarity matrix. This is done by first computing the maximum
      # width of all values and then pad every value with that max width.
      puts "Similarity matrix:"
      width = matrix.flatten.map { |v| v[:value] }.max.to_s.size + 2
      puts(matrix.map do |row|
        row.map do |entry|
          entry[:value].to_s.rjust(width)
        end.join
      end)
      puts "\n"

      # Print backtrack matrix. In order to be able to represent multiple
      # backtrack pointers, we are using simple letters here.
      puts "Backtrack matrix (l: left; u: up; d: diagonal):"
      puts(matrix.map do |row|
        row.map do |entry|
          [
            ("d" if entry[:backtrack].include?(:diagonal)),
            ("l" if entry[:backtrack].include?(:left)),
            ("u" if entry[:backtrack].include?(:up))
          ].join.rjust(3)
        end.join
      end)
      puts "\n"
    end

    def backtrack(matrix, sequence_1, sequence_2)

      # Follow backtrack pointers and compute an optimal alignment.
      # Please notice, that there often exist multiple optimal
      # alignments. For the sake of simplicity, we only track back one
      # such alginment. This could be extended in the future.
      alignment_1, alignment_2 = "", ""
      row, col = sequence_1.size, sequence_2.size
      while row > 0 || col > 0
        case matrix[row][col][:backtrack]

          # Backtrack diagonal
          when -> (t) { t.include?(:diagonal) }
            alignment_1.prepend(sequence_1[row - 1])
            alignment_2.prepend(sequence_2[col - 1])
            row -= 1; col -= 1

          # Backtrack left
          when -> (t) { t.include?(:left) }
            alignment_1.prepend("-")
            alignment_2.prepend(sequence_2[col - 1])
            col -= 1

          # Backtrack up
          when -> (t) { t.include?(:up) }
            alignment_1.prepend(sequence_1[row - 1])
            alignment_2.prepend("-")
            row -= 1
        end
      end

      # Return final optimal alignment
      [alignment_1, alignment_2]
    end
end
	# This script is an implementation of the Needleman-Wunsch algorithm for
	# sequence alignment (mostly used for nucleotide or protein sequences in
	# bioinformatics). The algorithm works by first computing a similarity matrix
	# and then backtrack the optimal paths. The scores/penalties for matches,
	# mismatches and gaps can be set. Complexity is O(n*m) where n and m are the
	# lengths of both sequences. No dependencies have been used.
	#
	# https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
	#
	class NeedlemanWunschAligner

	def initialize(match: 1, mismatch: -1, gap: -1)
	@match, @mismatch, @gap = match, mismatch, gap
	end

	def align(sequence_1, sequence_2, inspect: false)

	# Build similarity matrix
	matrix = build_matrix(sequence_1, sequence_2)

	# Print similarity matrix and backtrack matrix if inspect is set
	inspect(matrix) if inspect

	# Perform backtracking and return an optimal alignment
	backtrack(matrix, sequence_1, sequence_2)
	end

	private

	def build_matrix(sequence_1, sequence_2)

	# Init matrix with nils. In this implementation, each matrix entry is a
	# hash, consisting of the similarity value and backtracking information.
	# E.g. matrix[3][5] = { value: 3, backtrack: [:diagonal, :up] }
	matrix = Array.new(sequence_1.size + 1) do
	Array.new(sequence_2.size + 1)
	end

	# Fill matrix with similarity values and backtracking information.
	# sequence_1 is rows (downward) and sequence_2 is columns (rightward).
	matrix.size.times do \|row\|
	matrix.first.size.times do \|col\|

	# Compute value from the diagonal upper left
	diagonal = if row.positive? && col.positive?
	matrix[row - 1][col - 1][:value] +
	(sequence_1[row - 1] == sequence_2[col - 1] ? @match : @mismatch)
	end

	# Compute value from the left
	left = if col.positive?
	matrix[row][col - 1][:value] + @gap
	end

	# Compute value from above
	up = if row.positive?
	matrix[row - 1][col][:value] + @gap
	end

	# Compute maximum of the three values above, this is the new value
	max = [diagonal, up, left].compact.max \|\| 0

	# Save similarity value and trackback the origin of the value
	matrix[row][col] = {
	value: max,
	backtrack: [
	(:diagonal if max == diagonal),
	(:left if max == left),
	(:up if max == up)
	].compact
	}
	end
	end

	# Return filled matrix
	matrix
	end

	def inspect(matrix)

	# Print similarity matrix. This is done by first computing the maximum
	# width of all values and then pad every value with that max width.
	puts "Similarity matrix:"
	width = matrix.flatten.map { \|v\| v[:value] }.max.to_s.size + 2
	puts(matrix.map do \|row\|
	row.map do \|entry\|
	entry[:value].to_s.rjust(width)
	end.join
	end)
	puts "\n"

	# Print backtrack matrix. In order to be able to represent multiple
	# backtrack pointers, we are using simple letters here.
	puts "Backtrack matrix (l: left; u: up; d: diagonal):"
	puts(matrix.map do \|row\|
	row.map do \|entry\|
	[
	("d" if entry[:backtrack].include?(:diagonal)),
	("l" if entry[:backtrack].include?(:left)),
	("u" if entry[:backtrack].include?(:up))
	].join.rjust(3)
	end.join
	end)
	puts "\n"
	end

	def backtrack(matrix, sequence_1, sequence_2)

	# Follow backtrack pointers and compute an optimal alignment.
	# Please notice, that there often exist multiple optimal
	# alignments. For the sake of simplicity, we only track back one
	# such alginment. This could be extended in the future.
	alignment_1, alignment_2 = "", ""
	row, col = sequence_1.size, sequence_2.size
	while row > 0 \|\| col > 0
	case matrix[row][col][:backtrack]

	# Backtrack diagonal
	when -> (t) { t.include?(:diagonal) }
	alignment_1.prepend(sequence_1[row - 1])
	alignment_2.prepend(sequence_2[col - 1])
	row -= 1; col -= 1

	# Backtrack left
	when -> (t) { t.include?(:left) }
	alignment_1.prepend("-")
	alignment_2.prepend(sequence_2[col - 1])
	col -= 1

	# Backtrack up
	when -> (t) { t.include?(:up) }
	alignment_1.prepend(sequence_1[row - 1])
	alignment_2.prepend("-")
	row -= 1
	end
	end

	# Return final optimal alignment
	[alignment_1, alignment_2]
	end
	end