''' create_partitions.py Creates train/test partitions for classification tasks based on label assignments provided by http://www.ifs.tuwien.ac.at/mir/msd/ Copyright (C) 2012 Alexander Schindler Institute of Software Technology and Interactive Systems Vienna University of Technology Information and Software Engineering Group This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Version 1.0 ''' import os import gzip import argparse if __name__ == '__main__': # argument handling parser = argparse.ArgumentParser(description='Extract distinct features from an Arff-formatted file') parser.add_argument('--outdir', '-o', help='Directory to store output feature files', required=True, type=str) parser.add_argument('--featfile', '-f', help='Arff-formatted feature file', required=True, type=str) parser.add_argument('--splitfile', '-s', help='Split definition file', required=True, type=str) parser.add_argument('--labelfile', '-l', help='File containing track-labels', required=False, type=str) # parse arguments args = vars(parser.parse_args()) # assign required variables input_file_path = args['featfile'] label_file_path = args['labelfile'] split_file_path = args['splitfile'] destination_path = args['outdir'] # train_file = "{0}/{1}".format(destination_path, os.path.split(split_file_path.replace(split_file_path.split(".")[-1], "train.arff"))[-1]) test_file = "{0}/{1}".format(destination_path, os.path.split(split_file_path.replace(split_file_path.split(".")[-1], "test.arff"))[-1]) # open output files train = open(train_file, "w") test = open(test_file, "w") # load split mapping split_file = open(split_file_path, 'r') split_mapping = {} for line in split_file: # skip comments if line[0] == "%": train.write("{0}\n".format(line.rstrip())) test.write( "{0}\n".format(line.rstrip())) continue tmp = line.rstrip().split("\t") split_mapping[tmp[0]] = tmp[1] split_file.close() # load class label assignments label_file = open(label_file_path, 'r') labels_mapping = {} labels = set() skip_line = True # load labels for line in label_file: # skip header line if skip_line: skip_line = False continue tmp = line.rstrip().split("\t") labels_mapping[tmp[0]] = tmp[1] labels.add(tmp[1]) label_file.close() # add newline train.write("\n") test.write("\n") # process feature file if input_file_path.split(".")[-1] == "gz": track_id_file = gzip.open(input_file_path, 'r') else: track_id_file = open(input_file_path, 'r') header = True for line in track_id_file: # copy header if header: if line.lower().rstrip() == "@data": header = False if line.find("string") != -1: labels_str = "" for label in labels: labels_str = "{0},{1}".format(labels_str,label) labels_str = labels_str[1:] train.write("@ATTRIBUTE class {{{0}}}\n".format(labels_str)) test.write( "@ATTRIBUTE class {{{0}}}\n".format(labels_str)) else: train.write("{0}\n".format(line.rstrip())) test.write( "{0}\n".format(line.rstrip())) continue # process feature vector track_id = line.lstrip().split(",")[-1].strip() if len(track_id) < 5: track_id = line.lstrip().split(",")[-2].strip() if track_id[0].lstrip() == "'": track_id = track_id.lstrip()[1:-1] if split_mapping.has_key(track_id) and labels_mapping.has_key(track_id): if split_mapping[track_id].lower() == "train": train.write("{0}\n".format(line.rstrip().replace(track_id,labels_mapping[track_id]))) elif split_mapping[track_id].lower() == "test": test.write( "{0}\n".format(line.rstrip().replace(track_id,labels_mapping[track_id]))) else: print "*** ERROR: Unexpected label in track assignment file" train.close() test.close() track_id_file.close()