blob: 777bd79671aea3f6dbbe4cc36bd6bfcd35a25e88 [file] [log] [blame] [edit]
import difflib
import argparse
from pathlib import Path
import re
import log_printer
def junk_characters(a):
if a in ' \t\"':
return True
else:
return False
def clean_lines(
lines,
remove_comments=True,
move_entry_to_newline=True,
remove_quotes=True,
remove_whitespaces=True,
unify_numbers=True,
remove_line_breaks=True):
'''Performs file preprocessing before running diff.
It is used for removing all semantically irrelevant characters that may
blur the real differences between two files. By default it converts all
tabs to single whitespace.
Parameters
----------
lines: list
list of Liberty lines to clean
remove_comments: bool
removes comments if True
move_entry_to_newline: bool
if True and when there is content present after the closing brace, it
is moved to newline
remove_quotes: bool
remove all quotes characters from file if True
remove_whitespaces: bool
remove all whitespaces if True
unify_numbers: bool
unifies the number notation if True
remove_line_breaks:
when the line is broken into multiple lines with backslash, convert it
to single line if True
Returns
-------
list: cleaned lines
'''
# join all lines into single string
fullfile = '\n'.join(lines)
if remove_comments:
# remove comments (C/C++ style)
fullfile = re.sub(r'(?:\/\*(.*?)\*\/)|(?:\/\/(.*?))', '',
fullfile, flags=re.DOTALL)
# remove comments (Python style)
fullfile = re.sub(r'#[^\n]*\n', '', fullfile, flags=re.DOTALL)
# replace all tabs with single space
fullfile = fullfile.replace('\t', ' ')
if move_entry_to_newline:
# move non-whitespace content after } to new line
fullfile = re.sub(r'}\s*(?!\n)', '}\n', fullfile, flags=re.DOTALL)
if remove_quotes:
# remove quotes
fullfile = fullfile.replace('"', '')
if remove_whitespaces:
# remove whitespaces
fullfile = fullfile.replace(' ', '')
if remove_line_breaks:
fullfile = re.sub(r'\\\s*\n', '', fullfile, flags=re.DOTALL)
# split single string into lines
lines = fullfile.split('\n')
fullfile = ''
if remove_whitespaces:
# remove empty lines and trailing whitespaces
lines = [line.rstrip() for line in lines if line.strip()]
if unify_numbers:
floats = re.compile(
r'(?P<number>[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)')
for i in range(len(lines)):
lines[i] = floats.sub(
lambda m: str(float(m.group('number'))),
lines[i])
# add newlines at the end of each line
lines = [line + '\n' for line in lines]
return lines
def diff_files(
in1,
in2,
print_diff=True,
html_path=None,
html_row_width=100,
return_similarity=False,
similarity_method='quick'):
'''Compares two cleaned Liberty files and returns desired representations.
Parameters
----------
in1: list
List of cleaned lines for first Liberty file
in2: list
List of cleaned lines for second Liberty file
print_diff: bool
If True, function will print Differ-like diff to standard output
html_path: str
If None, does nothing. Otherwise, it generates HTML file containing
side-by-side comparison of cleaned Liberty files in the path
specified in the parameter
html_row_width: int
Number of characters within single row of one side of the HTML
comparison
return_similarity: bool
If True, function will return the value from 0 to 1 representing the
similarity between files, where 0 means they are completely different,
and 1 means they are exactly the same
similarity_method: str
Method used for computing similarity measure. Values can be:
* normal - slow, exact comparison,
* quick - faster, less exact comparison that returns upper bound for
normal similarity ratio
* real_quick - very fast, very inaccurate
Returns
-------
float: similarity measure result if `return_similarity` is True,
otherwise None
'''
if html_path:
diff = difflib.HtmlDiff(
charjunk=junk_characters,
tabsize=4,
wrapcolumn=html_row_width)
with open(html_path, 'w') as outfile:
result = diff.make_file(in1, in2, context=True)
outfile.write(result)
if print_diff:
diff = difflib.ndiff(
in1,
in2,
charjunk=junk_characters)
print(''.join(diff))
if return_similarity:
seqmatcher = difflib.SequenceMatcher(None, ''.join(in1), ''.join(in2))
if similarity_method == 'normal':
return seqmatcher.ratio()
elif similarity_method == 'quick':
return seqmatcher.quick_ratio()
elif similarity_method == 'real_quick':
return seqmatcher.real_quick_ratio()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
"input1",
help="First Liberty file",
type=Path)
parser.add_argument(
"input2",
help="Second Liberty file",
type=Path)
parser.add_argument(
"--output-html",
help="Generate side-by-side diff to HTML file with given path",
type=Path)
parser.add_argument(
"--html-row-width",
help="Row width of the HTML side-by-side column",
default=100,
type=int)
parser.add_argument(
"--not-remove-comments",
help="Do not remove comments before comparing files",
action="store_true")
parser.add_argument(
"--not-move-entry-to-newline",
help="Do not move content after closing bracket '}' to newline",
action="store_true")
parser.add_argument(
"--not-remove-quotes",
help="Do not remove quotes '\"' from documents before comparison",
action="store_true")
parser.add_argument(
"--not-remove-whitespaces",
help="Do not remove comments before comparing files",
action="store_true")
parser.add_argument(
"--not-remove-line-breaks",
help=r"Do not remove line breaks '\\n'",
action="store_true")
parser.add_argument(
"--not-unify-numbers",
help="Do not convert numbers in both files to unified form",
action="store_true")
parser.add_argument(
"--print-diff",
help="Print the diff to stdout",
action="store_true")
parser.add_argument(
"--compute-similarity",
help="Computes and prints the similarity between files (0-1)",
action="store_true")
parser.add_argument(
"--similarity-method",
help="The method used for computing similarity",
type=str,
default='quick',
choices=['normal', 'quick', 'real_quick'])
parser.add_argument(
"--log-suppress-below",
help="The mininal not suppressed log level",
type=str,
default="ERROR",
choices=log_printer.LOGLEVELS)
args = parser.parse_args()
log_printer.SUPPRESSBELOW = args.log_suppress_below
with open(args.input1, 'r') as input1:
in1 = input1.readlines()
with open(args.input2, 'r') as input2:
in2 = input2.readlines()
in1 = clean_lines(
in1,
not args.not_remove_comments,
not args.not_move_entry_to_newline,
not args.not_remove_quotes,
not args.not_remove_whitespaces,
not args.not_unify_numbers,
not args.not_remove_line_breaks)
in2 = clean_lines(
in2,
not args.not_remove_comments,
not args.not_move_entry_to_newline,
not args.not_remove_quotes,
not args.not_remove_whitespaces,
not args.not_unify_numbers,
not args.not_remove_line_breaks)
similarity = diff_files(
in1,
in2,
args.print_diff,
args.output_html,
args.html_row_width,
args.compute_similarity,
args.similarity_method)
if args.compute_similarity:
print('Similarity between documents: {}'.format(similarity))