import codecs
import sys
import time
def main( argv ):
    """Remove duplicate lines from the UTF-8 text file named by argv[0].

    argv must contain exactly one element, the path of the file to
    process; otherwise the usage text is printed and nothing else happens.

    The first occurrence of each line is kept, later occurrences are
    dropped and reported on standard output. The file is rewritten in
    place only when at least one duplicate was removed.
    """
    if len( argv ) != 1:
        print_usage()
        return

    file_name = argv[0]
    lines = read_file_lines_utf8( file_name )

    # Contents already encountered, with CR/LF characters removed so that
    # lines differing only in their line terminator count as duplicates.
    seen = set()
    unique_lines = []
    for line in lines:
        key = line.replace( "\r", "" ).replace( "\n", "" )
        if key in seen:
            print( 'Removed line: "%s"' % key )
            continue
        seen.add( key )
        # Keep the original line, terminator included, for writing back.
        unique_lines.append( line )

    if len( unique_lines ) == len( lines ):
        print( "No duplicate lines found." )
    else:
        write_file_lines_utf8( file_name, unique_lines )
def print_usage():
    """Print the command-line usage text to standard output.

    The output is a blank line, a "USAGE:" heading, another blank line
    and the invocation pattern built from sys.argv[0].
    """
    usage_lines = [
        "",
        "USAGE:",
        "",
        " %s utf8_encoded_text_file_to_remove_duplicate_lines_from" % sys.argv[0],
    ]
    print( "\n".join( usage_lines ) )
def read_file_lines_utf8( file_name ):
    """Read a UTF-8 encoded file and return its lines as Unicode strings.

    Line terminators are kept on the returned lines. Any byte order mark
    characters at the start of a line are stripped.

    Decoding errors raised by the codec propagate to the caller; the file
    is closed in that case as well.
    """
    # Decode the BOM once instead of on every loop iteration.
    bom = codecs.BOM_UTF8.decode( "utf8" )

    # The with-statement closes the file even if readlines() raises,
    # e.g. on invalid UTF-8 input.
    with codecs.open( file_name, "r", "utf8" ) as f:
        lines = f.readlines()

    return [ l.lstrip( bom ) for l in lines ]
def write_file_lines_utf8( file_name, content_lines ):
    """Write content_lines to file_name encoded as UTF-8.

    An existing file is overwritten. The lines are written as given; no
    line terminators are added.
    """
    # The with-statement closes the file even if writelines() raises,
    # so the handle is never leaked.
    with codecs.open( file_name, "w", "utf8" ) as f:
        f.writelines( content_lines )
if __name__ == "__main__":
    # Script entry point: validate the command line, run the duplicate
    # removal and report the elapsed wall-clock time.
    start_time = time.time()
    argv = sys.argv[1:]
    if len( argv ) != 1:
        print_usage()
        # sys.exit() is always available; the bare exit() helper is only
        # installed by the site module (missing with "python -S").
        sys.exit( -1 )
    main( argv )
    t = time.time() - start_time
    print( "Done after %.2f seconds." % t )