Removing duplicate lines from text files

Last edited on

import codecs
import sys
import time

def main( argv ):
    if len( argv ) != 1:
        print_usage()
        return
    lines = read_file_lines_utf8( argv[0] )
    stripped_lines = {}
    res = []
    for l in lines:
        sl = l.replace( "\r", "" )
        sl = sl.replace( "\n", "" )
        if sl in stripped_lines:
            print 'Removed line: "%s"' % sl
            continue
        stripped_lines[sl] = ""
        res.append( l )
    if len( lines ) == len( res ):
        print "No duplicate lines found."
    else:
        write_file_lines_utf8( argv[0], res )



def print_usage():
    print
    print "USAGE:"
    print
    print "  %s utf8_encoded_text_file_to_remove_duplicate_lines_from" % sys.argv[0]

def read_file_lines_utf8( file_name ):
    res = []
    f = codecs.open( file_name, "r", "utf8" )
    lines = f.readlines()
    f.close()
    for l in lines:
        # Strip the BOM from the beginning of the Unicode string, if it exists
        l = l.lstrip( unicode( codecs.BOM_UTF8, "utf8" ) )
        res.append( l )
    return res

def write_file_lines_utf8( file_name, content_lines ):
    f = codecs.open( file_name, "w", "utf8" )
    f.writelines( content_lines )
    f.close()

if __name__ == "__main__":
    start_time = time.time()

    argv = sys.argv[1:]
    if len( argv ) == 0 or len( argv ) > 1:
        print_usage()
        exit( -1 )

    main( argv )

    t = time.time() - start_time
    print "Done after %.2f seconds." % t
remove_duplicates.py