Created
July 3, 2013 15:55
-
-
Save matze/5919701 to your computer and use it in GitHub Desktop.
Diff two texts and mark up the result with <del> and <ins>.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
import difflib | |
import codecs | |
from itertools import chain | |
# Splits on runs of whitespace; the capturing group makes ``split`` keep the
# separators.  The UNICODE flag is passed as an argument because a trailing
# inline ``(?u)`` is rejected by newer versions of the ``re`` module.
_diff_split_re = re.compile(r'(\s+)', re.UNICODE)


def text_split(text):
    """Split *text* into tokens, each word carrying its leading whitespace.

    The first token is the text before the first whitespace run (possibly
    empty); every following token is a whitespace run immediately followed
    by the word after it (possibly empty at the end of the text).
    Concatenating the returned tokens reproduces *text* exactly.

    :param text: the string to tokenise.
    :returns: a list of string tokens.
    """
    # With one capturing group ``split`` always returns an odd-length list:
    # [word, sep, word, sep, ..., word].
    parts = _diff_split_re.split(text)
    tokens = [parts[0]]
    tokens.extend(sep + word for sep, word in zip(parts[1::2], parts[2::2]))
    return tokens
def diff(old_text, new_text):
    """Yield the pieces of a word-level diff between two texts.

    Both texts are tokenised with ``text_split`` and compared with
    ``difflib.SequenceMatcher``.  Unchanged runs are yielded verbatim,
    removed runs are wrapped in ``<del>`` and added runs in ``<ins>``; a
    replacement yields the ``<del>`` piece followed by the ``<ins>`` piece.
    The text itself is not HTML-escaped.

    :param old_text: the original text.
    :param new_text: the revised text.
    :returns: a generator of string fragments.
    """
    before = text_split(old_text)
    after = text_split(new_text)

    def wrap(tag, words):
        # Surround the concatenated tokens with an opening/closing tag pair.
        return u'<{0}>{1}</{0}>'.format(tag, u''.join(words))

    opcodes = difflib.SequenceMatcher(None, before, after).get_opcodes()
    for op, a_lo, a_hi, b_lo, b_hi in opcodes:
        removed = op in ('replace', 'delete')
        added = op in ('replace', 'insert')
        if removed:
            yield wrap('del', before[a_lo:a_hi])
        if added:
            yield wrap('ins', after[b_lo:b_hi])
        if not (removed or added):
            # Anything else is an unchanged run: emit it as-is.
            yield u''.join(before[a_lo:a_hi])
def merge_lines(seq):
    """Merge consecutive non-empty lines into paragraphs.

    Every non-empty line is appended to the current paragraph followed by a
    single space; an empty line closes the current paragraph (which may
    itself be empty when several empty lines follow each other).  A final
    paragraph that is not terminated by an empty line is kept as well.

    :param seq: a sequence of lines without their newline characters.
    :returns: a list of paragraph strings, or ``None`` if *seq* is empty.
    """
    if not seq:
        return None

    merged = []
    pending = []
    for line in seq:
        if line != u'':
            pending.append(line)
            continue
        # Each line keeps its trailing space, so an empty paragraph is u''.
        merged.append(u''.join(part + u' ' for part in pending))
        pending = []

    # Flush the last paragraph; without this, text that does not end with an
    # empty line would lose its final paragraph.
    if pending:
        merged.append(u''.join(part + u' ' for part in pending))
    return merged
def render_html_diff(old_text, new_text):
    """Render a paragraph-by-paragraph diff of two texts as marked-up text.

    Each text is split on ``'\\n'`` and grouped into paragraphs with
    ``merge_lines``.  Paragraphs are paired by position and each pair is
    diffed with ``diff``; every diffed paragraph is followed by a blank
    line.  When one text has more paragraphs than the other, the missing
    paragraphs are treated as empty strings so that added or removed
    paragraphs still appear in the output instead of being dropped.

    :param old_text: the original text.
    :param new_text: the revised text.
    :returns: the concatenated diff markup as a single string.
    """
    try:
        from itertools import zip_longest
    except ImportError:  # Python 2 names it izip_longest
        from itertools import izip_longest as zip_longest

    old_paragraphs = merge_lines(old_text.split('\n'))
    new_paragraphs = merge_lines(new_text.split('\n'))
    pairs = zip_longest(old_paragraphs, new_paragraphs, fillvalue=u'')
    return u''.join(
        u''.join(diff(old_par, new_par)) + u'\n\n'
        for old_par, new_par in pairs
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment