|
# Copyright (C) 2009-2010 by Fog Creek Software. All rights reserved.
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2, incorporated herein by reference.
import difflib
import re
from pygments import highlight
from pygments.lexers import get_lexer_for_filename, guess_lexer_for_filename, TextLexer
from pygments.formatters import HtmlFormatter
# Map file extensions pygments does not know onto equivalent ones it
# does (e.g. .resx/.csproj files are really XML), so an appropriate
# lexer can be found by extension.
EXTENSION_MAP = {'resx': 'xml',
                 'csproj': 'xml',
                 'was': 'vb',
                 'vbs': 'vb',
                 'fbp5': 'xml',
                 'xul': 'xml',
                 'ipp': 'cpp',
                 'jsm': 'js'}
# Maximum number of characters of a single line that will be
# highlighted; longer lines are truncated before lexing.
LINE_MAX = 20000
def ensurenewline(s):
    """Return *s* guaranteed to end with a newline character."""
    if s.endswith('\n'):
        return s
    return s + '\n'
def tweak(filename, extension_map=None):
    """Change filename to a known extension, if applicable.

    Strips any directory components and remaps unusual extensions
    (via EXTENSION_MAP, or *extension_map* when given) onto ones
    pygments recognizes.  A name with no extension is returned
    unchanged (basename only) instead of raising ValueError as the
    previous implementation did.
    """
    if extension_map is None:
        extension_map = EXTENSION_MAP
    basename = filename.split('/')[-1]
    if '.' not in basename:
        # Nothing to remap; the old rsplit/unpack crashed on this case.
        return basename
    (stem, extension) = basename.rsplit('.', 1)
    return stem + '.' + extension_map.get(extension, extension)
def lexer(filename, content=None):
    """Select an appropriate pygments lexer based on the filename.

    When *content* is supplied, pygments is allowed to analyse it to
    disambiguate between lexers claiming the same extension.  Falls
    back to a plain-text lexer when no lexer can be determined.
    """
    try:
        if content:
            l = guess_lexer_for_filename(tweak(filename), content, stripnl=False)
        else:
            l = get_lexer_for_filename(tweak(filename), stripnl=False)
    except Exception:
        # No matching lexer (pygments raises ClassNotFound) or a
        # malformed filename: degrade to plain text.  Was a bare
        # `except:`, which also swallowed KeyboardInterrupt/SystemExit.
        l = TextLexer(stripnl=False)
    # Make whitespace visible in the highlighted output.
    l.add_filter('whitespace', spaces=True, wstokentype=False)
    return l
class IntralineHtmlFormatter(HtmlFormatter):
    """HtmlFormatter that additionally marks intraline diff changes.

    `ranges` is a list of (start, end) character offsets into the text
    being highlighted; token text falling inside any range gets an
    extra ' ch' CSS class appended so changed words within a diff line
    can be styled differently.
    """
    # True while the tokens currently being yielded fall inside one of
    # self.ranges; read by _get_css_class.  NOTE(review): this works
    # only because HtmlFormatter consumes _split_change_markers lazily,
    # so the flag is toggled in lockstep with token formatting.
    in_change = False
    # Class-level default; overridden per instance by __init__ when a
    # truthy `ranges` argument is passed.  (Never mutated, so the
    # shared class-level list is harmless here.)
    ranges = []
    def __init__(self, ranges=None, *args, **kw):
        # Only override the class default for a non-empty range list.
        if ranges:
            self.ranges = ranges
        HtmlFormatter.__init__(self, *args, **kw)
    def _split_change_markers(self, tokensource):
        '''Pre-process the token stream before it is formatted, to mark the tokens that should be highlighted for intraline diffs.

        Splits each token at the boundaries of self.ranges, toggling
        self.in_change around the pieces that fall inside a range.
        '''
        ranges = self.ranges or []
        # Character offset of the current piece within the whole text.
        pos = 0
        for ttype, value in tokensource:
            # A token may span several lines; process one line at a time.
            for value in value.splitlines(True):
                l = len(value)
                range = None  # NOTE(review): unused (and shadows the builtin)
                # All ranges overlapping the span [pos, pos + l].
                rr = [r for r in ranges if (r[0] <= pos <= r[1]) or (pos <= r[0] <= r[1] <= pos + l) or (r[0] <= pos + l <= r[1])]
                if not rr:
                    # No change touches this piece; pass it through.
                    yield ttype, value
                    pos += l
                    continue
                last = None
                for r in rr:
                    if r[0] <= pos:
                        # r starts at or before token
                        if r[1] <= pos + l:
                            # range covers prefix of token
                            self.in_change = True
                            i = r[1] - pos
                            yield ttype, value[:i]
                            self.in_change = False
                        else:
                            # range covers whole token
                            self.in_change = True
                            yield ttype, value
                            self.in_change = False
                    else:
                        # r starts in the middle of the token; first emit
                        # the unchanged gap since the previous range.
                        i = last[1] - pos if last else 0
                        j = r[0] - pos
                        yield ttype, value[i:j]
                        if r[1] <= pos + l:
                            # range covers middle chunk
                            self.in_change = True
                            i = r[0] - pos
                            j = r[1] - pos
                            yield ttype, value[i:j]
                            self.in_change = False
                        else:
                            # range covers suffix of token
                            self.in_change = True
                            i = r[0] - pos
                            yield ttype, value[i:]
                            self.in_change = False
                    last = r
                # Emit any unchanged tail after the final range.
                if last[1] <= pos + l:
                    i = last[1] - pos
                    yield ttype, value[i:]
                pos += l
    def _format_lines(self, tokensource):
        # Interpose the change-splitting generator so in_change is
        # accurate at the moment each token's CSS class is computed.
        return super(IntralineHtmlFormatter, self)._format_lines(self._split_change_markers(tokensource))
    def _get_css_class(self, ttype):
        # Append ' ch' (changed) to the normal class while inside a range.
        return super(IntralineHtmlFormatter, self)._get_css_class(ttype) + (' ch' if self.in_change else '')
def highlighted(lex, code, ranges=None):
    """Highlight *code* with *lex*, marking intraline *ranges* changed."""
    formatter = IntralineHtmlFormatter(ranges, nowrap=True)
    return highlight(code, lex, formatter)
def highlight_patch(lex, lines, ranges=None):
    """Highlight the body of one diff hunk, preserving line markers.

    Each entry of *lines* starts with its diff marker character
    ('-', '+', ' ', or '\\'); the remainder is the line body, which is
    truncated, highlighted as a whole, then recombined with its marker.
    """
    pairs = [(line[0], ensurenewline(line[1:LINE_MAX])) for line in lines]
    # "\ No newline at end of file" markers carry no source text;
    # substitute a bare newline before feeding the highlighter.
    pairs = [(marker, '\n' if marker == '\\' else body)
             for (marker, body) in pairs]
    code = ''.join(body for (_, body) in pairs)
    rendered = highlighted(lex, code, ranges).splitlines(True)
    out = []
    for idx, (marker, body) in enumerate(pairs):
        if idx >= len(rendered):
            # Highlighter produced fewer lines; keep the raw body.
            out.append(marker + body)
        elif marker == '\\':
            # Restore the canonical no-newline message.
            out.append(marker + ' No newline at end of file\n')
        else:
            out.append(marker + rendered[idx])
    return ''.join(out)
# returns a list of ranges (a, b), marking that characters a:b in the patch are changed.
def intraline_diff(patch):
    """Compute intraline change ranges for one diff hunk.

    *patch* is a list of diff body lines, each prefixed with '-', '+',
    or some other character (context).  Returns a sorted list of
    (start, end) character offsets into the concatenation of the lines
    with their prefix characters stripped — the same string that
    highlight_patch feeds to pygments.
    """
    removed_lines = []  # bodies of the '-' lines in the current run
    added_lines = []    # bodies of the '+' lines in the current run
    ranges = []
    l = 0               # offset into the prefix-stripped patch text
    # The trailing ' ' sentinel guarantees a final context line, so the
    # last run of -/+ lines is always flushed.
    for line in patch + [' ']:
        if line[0] == '-':
            removed_lines.append(line[1:])
        elif line[0] == '+':
            added_lines.append(line[1:])
        else:
            if added_lines or removed_lines:
                rtotal = sum(len(s) for s in removed_lines)
                atotal = sum(len(s) for s in added_lines)
                # split the diff text into whole words and individual non-word characters
                removed_words = [w for w in re.split(r'(\w+|\W)', ''.join(removed_lines)) if w]
                added_words = [w for w in re.split(r'(\w+|\W)', ''.join(added_lines)) if w]
                # Start offsets of the removed and added blocks in the
                # stripped patch: the '-' lines of a run precede the
                # '+' lines — assumes standard unified-diff grouping
                # within a hunk.
                removed, added = l, l + rtotal
                seq = difflib.SequenceMatcher();
                seq.set_seqs(removed_words, added_words)
                # find the matching words of each string, using the ranges in each opcode.
                # 'equal' action is for non-changed text; otherwise, mark the range as changed.
                for (action, r1, r2, a1, a2) in seq.get_opcodes():
                    ac = ''.join(added_words[a1:a2])
                    rc = ''.join(removed_words[r1:r2])
                    a = len(ac)
                    r = len(rc)
                    added += a
                    removed += r
                    if action == 'equal':
                        continue
                    if a != 0:
                        ranges.append((added - a, added))
                    if r != 0:
                        ranges.append((removed - r, removed))
                # Skip past the -/+ run just processed.
                l += atotal + rtotal
                removed_lines = []
                added_lines = []
            # Advance past this context line (minus its prefix char).
            l += len(line) - 1
    return sorted(ranges)
def format(filename, diff):
    """Return *diff* (a unified diff for *filename*) as highlighted HTML.

    Splits the diff at '@@' hunk headers (passed through untouched)
    and runs each hunk body through the intraline highlighter.
    Returns None for an empty diff.
    """
    if not diff:
        return None
    # Strip carriage returns at the byte level, then restore unicode
    # lines if that is what we were given.
    if isinstance(diff, unicode):
        diff_asc = diff.encode('utf-8')
    else:
        diff_asc = diff
    diff_asc = diff_asc.replace('\r', '')
    lines = diff_asc.splitlines(True)
    if isinstance(diff, unicode):
        lines = [l.decode('utf-8') for l in lines]
    lex = lexer(filename)
    formatted = []
    patch = []
    for line in lines:
        if line.startswith(u'@@'):
            # Flush the hunk accumulated before this header.
            # (Was `extend`, which appended the string char-by-char;
            # `append` produces the identical join but is the intent.)
            if patch:
                formatted.append(highlight_patch(lex, patch, intraline_diff(patch)))
            formatted.append(line)
            patch = []
        else:
            patch.append(line)
    if patch:
        formatted.append(highlight_patch(lex, patch, intraline_diff(patch)))
    return ''.join(formatted)
def format_diffs(diffs):
    """Attach a 'formatted_diff' entry to every diff dict in *diffs*."""
    for entry in diffs:
        entry['formatted_diff'] = format(entry['file']['name'], entry['diff'])
def format_file(filename, contents):
    """Highlight a whole file's contents, truncating overlong lines."""
    normalized = contents.replace('\r', '')
    truncated = [line[:LINE_MAX] for line in normalized.split('\n')]
    return highlighted(lexer(filename), '\n'.join(truncated))
|
Loading...