#!/usr/bin/env python

#
# A format checker for LIBSVM
#

#
# Copyright (c) 2007, Rong-En Fan
#
# All rights reserved.
#
# This program is distributed under the same license of the LIBSVM package.
#

from sys import argv, exit
import os.path


def err(line_no, msg):
    """Print the diagnostic `msg` for input line `line_no` to stdout."""
    print("line {0}: {1}".format(line_no, msg))


def my_float(x):
    """Convert the string `x` to a float, rejecting NaN and infinities.

    Works like float(), but raises ValueError for any text containing
    "nan" or "inf" (case-insensitive) in addition to everything float()
    itself rejects.
    """
    lowered = x.lower()
    if "nan" in lowered or "inf" in lowered:
        raise ValueError("not a finite number: {0!r}".format(x))
    return float(x)


def _check_label(label, line_no):
    """Validate a label token; report and return True if it is invalid."""
    if ',' in label:
        # multi-label format: every comma-separated part must be a number
        try:
            for part in label.split(','):
                my_float(part)
        except ValueError:
            err(line_no,
                "label {0} is not a valid multi-label form".format(label))
            return True
    else:
        try:
            my_float(label)
        except ValueError:
            err(line_no, "label {0} is not a number".format(label))
            return True
    return False


def _check_features(nodes, line_no):
    """Validate the feature tokens; report and return True on any error."""
    found_error = False
    prev_index = -1
    for i, node in enumerate(nodes):
        try:
            # A wrong number of ':'-separated fields fails the unpacking
            # with ValueError, exactly like a bad integer or bad float.
            (index, value) = node.split(':')
            index = int(index)
            value = my_float(value)
        except ValueError:
            err(line_no,
                "feature '{0}' not an <index>:<value> pair, <index> integer, "
                "<value> real number ".format(node))
            found_error = True
            continue

        # precomputed kernel's index starts from 0 and LIBSVM
        # checks it. Hence, don't treat index 0 as an error.
        if index < 0:
            err(line_no,
                "feature index must be positive; wrong feature {0}"
                .format(node))
            found_error = True
        elif index <= prev_index:
            # prev_index starts at -1, so this branch is never reached
            # for the first feature and nodes[i - 1] is always valid here.
            err(line_no,
                "feature indices must be in an ascending order, "
                "previous/current features {0} {1}"
                .format(nodes[i - 1], node))
            found_error = True
        prev_index = index
    return found_error


def _check_line(line, line_no):
    """Validate one raw input line; return True if it has any error."""
    found_error = False

    # each line must end with a newline character
    if not line.endswith('\n'):
        err(line_no, "missing a newline character in the end")
        found_error = True

    nodes = line.split()

    # check label
    if not nodes:
        err(line_no, "missing label, perhaps an empty line?")
        found_error = True
    elif _check_label(nodes.pop(0), line_no):
        found_error = True

    # check features
    if _check_features(nodes, line_no):
        found_error = True

    return found_error


def main():
    """Check the dataset named on the command line for LIBSVM format errors.

    Reads the dataset path from argv[1], prints one diagnostic per problem
    found plus a final summary, and returns 1 if any line has an error and
    0 otherwise.  Calls exit(1) when the argument count is wrong or the
    dataset path does not exist.
    """
    if len(argv) != 2:
        print("Usage: {0} dataset".format(argv[0]))
        exit(1)

    dataset = argv[1]

    if not os.path.exists(dataset):
        print("dataset {0} not found".format(dataset))
        exit(1)

    error_line_count = 0
    # The context manager closes the file even if checking is interrupted.
    with open(dataset, 'r') as data_file:
        for line_no, line in enumerate(data_file, 1):
            if _check_line(line, line_no):
                error_line_count += 1

    if error_line_count > 0:
        print("Found {0} lines with error.".format(error_line_count))
        return 1
    else:
        print("No error.")
        return 0


if __name__ == "__main__":
    exit(main())