'''
create_partitions.py

Creates train/test partitions for classification tasks based on label
assignments provided by http://www.ifs.tuwien.ac.at/mir/msd/

NOTE: in this revision the train/test split handling is disabled. The
script reads a multi-label assignment file and an ARFF feature file and
writes a single multi-label ARFF file: one {0,1} attribute per label,
followed by the feature attributes, with the track id column removed.

Copyright (C) 2012  Alexander Schindler
Institute of Software Technology and Interactive Systems
Vienna University of Technology
Information and Software Engineering Group

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see http://www.gnu.org/licenses/.

Version 1.0
'''

import os
import gzip
import argparse
import operator

# Defaults reproduce the values that used to be hard-coded in the script, so
# running it without arguments still reads and writes the same files.
DEFAULT_LABEL_FILE = "D:/MIR/Data/MSD/benchmark/MASD_Multilabel.txt"
DEFAULT_FEATURE_FILE = "D:/MIR/Data/MSD/benchmark/features/EN0.arff.gz"
DEFAULT_OUTPUT_FILE = "D:/test/EN0_ML.arff"

# The original loop stopped after twelve written instances
# (``if idx > 10: break``); kept as the default for backward compatibility.
DEFAULT_MAX_ROWS = 12


def load_label_info(label_file_path):
    """Read the multi-label assignment file.

    The first line is a header whose text after the last ``:`` is the number
    of labels. Every following line is ``track_id,label[,label...]``.

    Returns a tuple ``(num_labels, label_pos, label_info)`` where
    ``label_pos`` maps each label to its column index (assigned in order of
    first appearance) and ``label_info`` maps each track id to a list of
    ``num_labels`` 0/1 flags.

    Raises ValueError if the header count is not an integer or if the file
    contains more distinct labels than the header declares.
    """
    num_labels = 0
    label_pos = {}
    label_info = {}

    with open(label_file_path, "r") as label_file:
        header = next(label_file, None)
        if header is not None:
            num_labels = int(header.rstrip().split(":")[-1])

        for line in label_file:
            fields = [field.strip() for field in line.split(",")]
            track_id = fields[0]
            if not track_id:
                # blank line - nothing to assign
                continue

            flags = [0] * num_labels
            for label in fields[1:]:
                if not label:
                    # e.g. a trailing comma; would yield a nameless attribute
                    continue
                pos = label_pos.setdefault(label, len(label_pos))
                if pos >= num_labels:
                    raise ValueError(
                        "label file declares {0} labels but contains more "
                        "distinct labels (at '{1}')".format(num_labels, label))
                flags[pos] = 1
            label_info[track_id] = flags

    return num_labels, label_pos, label_info


def _open_text(path):
    """Open *path* for reading text, transparently handling ``.gz`` files."""
    if path.split(".")[-1] == "gz":
        # "rt" is required: plain "r" yields bytes for gzip files
        return gzip.open(path, "rt")
    return open(path, "r")


def _track_id_index(fields):
    """Return the index of the track id field of a split data line.

    The id is expected in the last field; if that field is shorter than five
    characters the second-to-last field is used instead.
    """
    idx = len(fields) - 1
    if idx > 0 and len(fields[idx].strip()) < 5:
        idx -= 1
    return idx


def write_multilabel_arff(label_file_path, input_file_path, output_file_path,
                          max_rows=DEFAULT_MAX_ROWS):
    """Write a multi-label ARFF file from a label file and a feature ARFF.

    label_file_path  -- multi-label assignment file (see load_label_info)
    input_file_path  -- ARFF feature file, optionally gzip-compressed (*.gz)
    output_file_path -- destination ARFF file (overwritten)
    max_rows         -- stop after this many instances were written;
                        ``None`` or ``0`` disables the limit

    Instances whose track id has no label assignment are skipped.
    Returns the number of instances written.
    """
    num_labels, label_pos, label_info = load_label_info(label_file_path)
    written = 0

    with _open_text(input_file_path) as feature_file, \
            open(output_file_path, "w") as outfile:

        # label attributes come first; "-C n" states how many there are
        outfile.write("@relation 'EN0: -C {0}'\n".format(num_labels))
        ordered = sorted(label_pos.items(), key=operator.itemgetter(1))
        for name, _ in ordered:
            outfile.write(
                "@attribute {0} {{0,1}}\n".format(name.replace(" ", "_")))

        in_header = True
        for line in feature_file:
            stripped = line.strip()

            if in_header:
                keyword = stripped.lower()
                if keyword == "@data":
                    in_header = False
                    # the marker has to be emitted, otherwise the output has
                    # no data section
                    outfile.write("@data\n")
                elif (keyword.startswith("@attribute")
                        and "track_id" not in stripped):
                    # copy feature attributes, drop the track id attribute
                    outfile.write(line.rstrip("\r\n") + "\n")
                continue

            if not stripped or stripped.startswith("%"):
                # blank line or ARFF comment inside the data section
                continue

            fields = stripped.split(",")
            id_idx = _track_id_index(fields)
            track_id = fields[id_idx].strip().replace("'", "")
            if track_id not in label_info:
                continue

            # same layout as str(list)[1:-1], i.e. "0, 1, 0"
            label_str = ", ".join(str(flag) for flag in label_info[track_id])
            # drop the id by position so that the remaining values stay intact
            feature_str = ",".join(
                field.replace("'", "")
                for pos, field in enumerate(fields) if pos != id_idx)

            outfile.write("{0},{1}\n".format(label_str, feature_str))
            written += 1

            if max_rows and written >= max_rows:
                break

    return written


def main(argv=None):
    """Parse command line arguments and run the conversion."""
    parser = argparse.ArgumentParser(
        description='Create a multi-label Arff file from an Arff-formatted '
                    'feature file and a track-label file')
    parser.add_argument('--labelfile', '-l', type=str,
                        default=DEFAULT_LABEL_FILE,
                        help='File containing track-labels')
    parser.add_argument('--featfile', '-f', type=str,
                        default=DEFAULT_FEATURE_FILE,
                        help='Arff-formatted feature file (may be .gz)')
    parser.add_argument('--outfile', '-o', type=str,
                        default=DEFAULT_OUTPUT_FILE,
                        help='Output Arff file')
    parser.add_argument('--max-rows', '-n', type=int,
                        default=DEFAULT_MAX_ROWS,
                        help='Maximum number of instances to write '
                             '(0 = no limit)')
    args = parser.parse_args(argv)

    write_multilabel_arff(args.labelfile, args.featfile, args.outfile,
                          max_rows=args.max_rows)


if __name__ == '__main__':
    main()