Comparing files (tab separated, UTF-8)

Last edited on

Introduction

Some tests need to compare only part of the contents of two files rather than the files as a whole.

This example demonstrates how to compare the values of selected columns in tab-separated, UTF-8 encoded files:

import codecs

def main():
    """Compare selected numeric columns of two TSV files and report the result.

    Rows 2 up to (but not including) 1139 are compared in columns 11, 12,
    13, 15, 16 and 17; the outcome is reported via test.passes() or
    test.fail().
    """
    fn1 = "testfile1.tsv"
    # NOTE(review): fn2 names the same file as fn1, so the file is compared
    # with itself -- confirm whether a second file was intended here.
    fn2 = "testfile1.tsv"
    msg = "File #1: " + fn1 + "; File #2: " + fn2
    if compare_tsv_values_float(fn1, fn2, start_row=2, end_row=1139, columns_to_compare=[11, 12, 13, 15, 16, 17]):
        test.passes("Comparison succeeded: " + msg)
    else:
        test.fail("Comparison failed: " + msg)

def compare_tsv_values_float(file_name1, file_name2, start_row=1, end_row=None, columns_to_compare=[0], valid_tolerance=0.0, log_passes=False, log_fails=True):
    """Compare selected columns of two TSV files as floating point numbers.

    Both files are loaded with read_tsv_utf8() and the rows with indices
    start_row .. end_row - 1 are compared cell by cell.

    Args:
        file_name1: Path of the first file.
        file_name2: Path of the second file.
        start_row: Zero-based index of the first row to compare.
        end_row: Zero-based index one past the last row to compare. None
            compares up to the end of the shorter file; a differing row
            count is then reported as a failed comparison.
        columns_to_compare: Zero-based indices of the columns to compare.
            The default list is only read, never modified.
        valid_tolerance: Largest absolute difference that still counts as
            equal.
        log_passes: Log every cell whose difference is within tolerance.
        log_fails: Log every cell whose difference exceeds the tolerance.

    Returns:
        True if every compared cell is within the tolerance (and, when
        end_row is None, both files have the same number of rows);
        False otherwise, including when end_row exceeds a file's row count.

    Raises:
        ValueError: A compared cell cannot be converted with float().
        IndexError: A compared row has no column at a requested index.
    """
    dataset1 = read_tsv_utf8(file_name1)
    dataset2 = read_tsv_utf8(file_name2)
    successful = True

    if end_row is None:
        # Only rows present in both files can be compared; a differing row
        # count is itself a mismatch rather than a reason to index past the
        # end of the shorter file.
        end_row = min(len(dataset1), len(dataset2))
        if len(dataset1) != len(dataset2):
            successful = False
            if log_fails:
                test.log("FAILED: Row count differs",
                         file_name1 + ": " + str(len(dataset1)) + " rows\r\n"
                         + file_name2 + ": " + str(len(dataset2)) + " rows")
    else:
        end_row_valid = True
        for file_name, dataset in ((file_name1, dataset1), (file_name2, dataset2)):
            if len(dataset) < end_row:
                test.fatal("Invalid end_row " + str(end_row) + "; file " + file_name + " only contains " + str(len(dataset)) + " rows")
                end_row_valid = False
        if not end_row_valid:
            # NOTE(review): assumes test.fatal() reports and returns rather
            # than raising -- confirm. Returning here avoids indexing past
            # the end of the data in either case.
            return False

    for row in range(start_row, end_row):
        row1 = dataset1[row]
        row2 = dataset2[row]
        for col in columns_to_compare:
            diff = float(row1[col]) - float(row2[col])
            msg = "Row/column " + str(row) + "/" + str(col) + ": Difference: " + str(diff)
            if abs(diff) <= valid_tolerance:
                if log_passes:
                    test.log("PASSED: " + msg, repr(row1) + "\r\n" + repr(row2))
            else:
                successful = False
                if log_fails:
                    test.log("FAILED: " + msg, repr(row1) + "\r\n" + repr(row2))
    return successful

def read_tsv_utf8(file_name):
    """Read a UTF-8 encoded, tab-separated file into a list of rows.

    Lines may be terminated by "\\r\\n", "\\r" or "\\n". A line terminator
    at the very end of the file does not produce an extra empty row; empty
    lines elsewhere are kept as rows containing one empty string.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list with one entry per line, each entry being the list of that
        line's tab-separated cell strings.
    """
    # The context manager closes the file even if reading/decoding fails.
    with codecs.open(file_name, "r", "utf8") as f:
        content = f.read()
    # Normalize all line ending styles to "\n" before splitting.
    content = content.replace("\r\n", "\n").replace("\r", "\n")
    lines = content.split("\n")
    # A terminated last line leaves an empty string behind the final "\n";
    # drop it so it is not mistaken for a data row.
    if lines and lines[-1] == "":
        lines.pop()
    return [line.split("\t") for line in lines]
test.py