"""
extract_features_from_arff.py

Extracts a subset of the attributes (columns) of an ARFF-formatted feature
file and writes them to a new ARFF file. Files whose name ends in ".gz" are
read/written gzip-compressed.

See also: http://www.ifs.tuwien.ac.at/mir/msd/

Copyright (C) 2012  Alexander Schindler
Institute of Software Technology and Interactive Systems
Vienna University of Technology
Information and Software Engineering Group

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

Version 1.0
"""

import argparse
import gzip
import sys


def usage():
    """Print a one-line summary of the command-line invocation."""
    print("\n extract_features_from_arff.py --input FEATURE_FILE "
          "[--list] [--output OUTPUT_FILE --features INDEXES] "
          "[--relation NAME] ")


def createFeatureMask(selected_features_str, attrs):
    """Translate an index specification into a per-attribute boolean mask.

    selected_features_str -- comma separated, 1-based attribute indexes;
                             each component is either a single index ("5")
                             or an inclusive interval ("1:3").
    attrs                 -- the attribute list; only its length is used.

    Returns a list of booleans, one per entry of attrs, that is True for
    every selected attribute.

    Raises IndexError if an index lies outside 1..len(attrs) and ValueError
    if a component is not an integer.
    """
    total = len(attrs)
    mask = [False] * total

    for comp in selected_features_str.split(","):
        comp = comp.strip()
        if not comp:
            # tolerate stray separators such as a trailing comma
            continue

        if ":" in comp:
            bounds = comp.split(":")
            first, last = int(bounds[0]), int(bounds[-1])
        else:
            first = last = int(comp)

        # Reject indexes below 1 explicitly: first - 1 would otherwise become
        # a negative list index and silently select attributes counted from
        # the end of the list.
        if first < 1 or last > total:
            raise IndexError(
                "attribute index '{0}' is outside the valid range 1..{1}"
                .format(comp, total))

        for i in range(first - 1, last):
            mask[i] = True

    return mask


def _openText(path, mode):
    """Open path in text mode ('r' or 'w'); '.gz' files go through gzip."""
    if path.endswith(".gz"):
        # gzip defaults to binary mode; the explicit 't' yields str lines
        return gzip.open(path, mode + "t")
    return open(path, mode)


def _progressLines(lines):
    """Yield (line_number, stripped_line), reporting every 1000th line."""
    for line_no, raw in enumerate(lines, 1):
        if line_no % 1000 == 0:
            print(line_no)
        yield line_no, raw.strip()


def _parseAttribute(line):
    """Split an '@attribute NAME TYPE' declaration into [NAME, TYPE]."""
    pieces = line.split(None, 1)
    rest = pieces[1].strip() if len(pieces) > 1 else ""

    closing = rest.find(rest[0], 1) if rest[:1] in ("'", '"') else -1
    if closing != -1:
        # quoted name: may contain blanks, so cut at the closing quote
        name = rest[:closing + 1]
        attr_type = rest[closing + 1:].strip()
    else:
        # the type is everything after the name, e.g. "{a, b}" stays intact
        fields = rest.split(None, 1)
        name = fields[0] if fields else ""
        attr_type = fields[1].strip() if len(fields) > 1 else ""

    if not name or not attr_type:
        raise ValueError(
            "malformed attribute declaration: {0!r}".format(line))
    return [name, attr_type]


def _readHeader(numbered_lines):
    """Consume the ARFF header up to and including the '@data' line.

    numbered_lines -- iterator of (line_number, stripped_line) pairs; it is
                      left positioned on the first line after '@data'.

    Returns (relation_name_or_None, attrs) where attrs is a list of
    [name, type] lists in declaration order.

    Raises ValueError if the input ends before an '@data' line is seen.
    """
    relation = None
    attrs = []

    for _, line in numbered_lines:
        lowered = line.lower()
        if lowered == "@data":
            return relation, attrs
        if lowered.startswith("@relation"):
            parts = line.split(None, 1)
            if relation is None and len(parts) > 1:
                relation = parts[1].strip()
        elif lowered.startswith("@attribute"):
            attrs.append(_parseAttribute(line))

    raise ValueError("no @data section found in input")


def extractFeatures(lines, selected_features_str, relation_name=None):
    """Yield the lines of an ARFF document reduced to the selected attributes.

    lines                 -- iterable of input lines (e.g. an open text file).
    selected_features_str -- index specification, see createFeatureMask().
    relation_name         -- relation name to write; when None the name
                             declared in the input header is used.

    The first yielded line is always the '@Relation' line, so nothing is
    produced unless header and index specification are valid. The selected
    attributes are also echoed to stdout.

    NOTE: data rows are split on plain commas; quoted values containing
    commas and sparse '{...}' rows are not supported.
    """
    numbered = _progressLines(lines)
    header_relation, attrs = _readHeader(numbered)
    if relation_name is None:
        relation_name = header_relation

    mask = createFeatureMask(selected_features_str, attrs)
    selected = [i for i, keep in enumerate(mask) if keep]

    yield "@Relation {0}\n".format(relation_name)
    for i in selected:
        yield "@attribute {0} {1}\n".format(attrs[i][0], attrs[i][1])
        print("{0} {1}".format(i + 1, attrs[i]))
    yield "@Data\n"

    for line_no, line in numbered:
        if not line or line.startswith("%"):
            # blank lines and ARFF comments carry no feature vector
            continue

        values = line.split(",")
        if selected and len(values) <= selected[-1]:
            raise IndexError(
                "line {0}: expected at least {1} values, found {2}"
                .format(line_no, selected[-1] + 1, len(values)))

        yield "{0}\n".format(",".join(values[i] for i in selected).strip())


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    argv -- argument list without the program name; defaults to sys.argv[1:].
    """
    parser = argparse.ArgumentParser(description='Extract distinct features from an Arff-formatted file')
    parser.add_argument('--list', help='List all attributes of the provided file', action='store_true')
    parser.add_argument('--input', '-i', help='Input feature file', required=True, type=str)
    parser.add_argument('--output', '-o', help='Output feature file', type=str)
    parser.add_argument('--features', '-f', help='comma separated list of attribute indexes (e.g. 1:3,5,10)', type=str)
    parser.add_argument('--relation', '-r', help='Relation Name', required=False, type=str)
    args = parser.parse_args(argv)

    # Listing only reads the header, so output/selection are needed (and the
    # output file is touched) only when actually extracting.
    if not args.list and (args.output is None or args.features is None):
        parser.error("--output and --features are required unless --list is given")

    try:
        with _openText(args.input, "r") as source:
            if args.list:
                _, attrs = _readHeader(_progressLines(source))
                for number, (name, attr_type) in enumerate(attrs, 1):
                    print("[{0}] {1} ({2})".format(number, name, attr_type))
                return 0

            rows = extractFeatures(source, args.features, args.relation)
            # Pull the first line before opening the output: this parses the
            # header and validates the selection, so an invalid request does
            # not truncate an existing output file.
            first_line = next(rows)
            with _openText(args.output, "w") as target:
                target.write(first_line)
                for row in rows:
                    target.write(row)
    except (ValueError, IndexError) as exc:
        sys.exit("error: {0}".format(exc))

    return 0


if __name__ == '__main__':
    sys.exit(main())