'''
extract_features_from_arff.py
Extracts a selected subset of attributes (features) from an ARFF-formatted
feature file, such as those provided by http://www.ifs.tuwien.ac.at/mir/msd/
Copyright (C) 2012 Alexander Schindler
Institute of Software Technology and Interactive Systems
Vienna University of Technology
Information and Software Engineering Group
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Version 1.0
'''
import gzip
import argparse
def usage():
    """Print a one-line synopsis of the command-line invocation.

    The synopsis lists the options that the argument parser of this script
    defines (``--input``, ``--output``, ``--features``, ``--relation`` and
    ``--list``).
    """
    # A parenthesised single-argument print parses identically on Python 2
    # and Python 3, so the script stays loadable on both.
    print("\n extract_features_from_arff.py --input FEATURE_FILE --output OUTPUT_FILE "
          "--features INDEXES [--relation NAME] [--list] ")
def createFeatureMask(selected_features_str, attrs):
    """Build a boolean selection mask from a 1-based attribute specification.

    Args:
        selected_features_str: Comma separated list of components. Each
            component is either a single 1-based index (``"5"``) or an
            inclusive interval ``"start:end"`` (``"1:3"``). Whitespace
            around components is ignored.
        attrs: Sequence of attributes; only its length is used.

    Returns:
        A list of ``len(attrs)`` booleans where ``True`` marks a selected
        attribute.

    Raises:
        ValueError: If a component is not an integer / integer interval, or
            if an interval is reversed (``start > end``).
        IndexError: If an index lies outside ``1..len(attrs)``.
    """
    total = len(attrs)
    mask = [False] * total

    def checked(position, component):
        # Without this guard, 0 and negative numbers would be accepted by
        # Python's negative indexing and silently select attributes counted
        # from the end of the list.
        if not 1 <= position <= total:
            raise IndexError(
                "attribute index {0} in '{1}' is outside the valid range 1..{2}".format(
                    position, component, total))
        return position

    for component in selected_features_str.split(","):
        component = component.strip()

        if ":" in component:
            # interval definition: first and last field are the bounds
            bounds = component.split(":")
            first = checked(int(bounds[0]), component)
            last = checked(int(bounds[-1]), component)
            if first > last:
                raise ValueError(
                    "interval '{0}' is reversed (start > end)".format(component))
            mask[first - 1:last] = [True] * (last - first + 1)
        else:
            # single value definition
            mask[checked(int(component), component) - 1] = True

    return mask
def _open_text(path, mode):
    """Open *path* for text I/O, going through gzip when it ends in ``.gz``.

    *mode* is ``'r'`` or ``'w'``.
    """
    # On Python 3 gzip.open defaults to binary mode, which would yield bytes
    # and break the string comparisons below; request text mode there.
    # On Python 2 (where ``str is bytes``) the plain mode already yields str.
    if str is not bytes:
        mode += "t"
    opener = gzip.open if path.endswith(".gz") else open
    return opener(path, mode)


def _report_progress(line_no):
    """Print the current input line number every 1000 lines."""
    if line_no % 1000 == 0:
        print(line_no)


def _parse_attribute(line, line_no):
    """Split an ``@attribute`` declaration into ``[name, type]``.

    The type is everything after the name, so nominal specifications such as
    ``{rock, pop}`` and date formats are kept intact. A quoted name keeps its
    quotes so it can be written back verbatim.

    Raises:
        ValueError: If the declaration has no name or no type.
    """
    pieces = line.split(None, 1)
    rest = pieces[1].strip() if len(pieces) > 1 else ""

    name, kind = rest, ""
    if rest[:1] in ("'", '"'):
        # quoted names may contain whitespace: cut at the closing quote
        closing = rest.find(rest[0], 1)
        if closing != -1:
            name, kind = rest[:closing + 1], rest[closing + 1:].strip()
    else:
        fields = rest.split(None, 1)
        if len(fields) == 2:
            name, kind = fields[0], fields[1].strip()

    if not name or not kind:
        raise ValueError(
            "line {0}: malformed @attribute declaration: {1!r}".format(line_no, line))
    return [name, kind]


def _read_header(numbered_lines):
    """Consume header lines up to and including the ``@data`` tag.

    Args:
        numbered_lines: Iterator of ``(line_number, raw_line)`` pairs. It is
            left positioned on the first line after ``@data``.

    Returns:
        ``(relation, attrs, found_data)`` where *relation* is the name from
        the first ``@relation`` line (or ``None``), *attrs* is a list of
        ``[name, type]`` pairs in declaration order and *found_data* tells
        whether an ``@data`` tag was seen before the input ended.
    """
    relation = None
    attrs = []
    for line_no, raw in numbered_lines:
        _report_progress(line_no)
        line = raw.strip()
        keyword = line.lower()

        if keyword == "@data":
            return relation, attrs, True

        # Match the keyword at the start of the line only, so '%' comment
        # lines that merely mention a keyword are not parsed as declarations.
        if keyword.startswith("@relation"):
            fields = line.split(None, 1)
            if relation is None and len(fields) > 1:
                relation = fields[1].strip()
        elif keyword.startswith("@attribute"):
            attrs.append(_parse_attribute(line, line_no))

    return relation, attrs, False


def _write_selected_rows(numbered_lines, sink, mask):
    """Write the masked columns of every data row to *sink*.

    Blank lines and ``%`` comment lines are skipped.

    Raises:
        ValueError: If a row has fewer values than a selected column needs.
    """
    keep = [idx for idx, flag in enumerate(mask) if flag]
    for line_no, raw in numbered_lines:
        _report_progress(line_no)
        line = raw.strip()
        if not line or line.startswith("%"):
            continue

        values = line.split(",")
        try:
            chosen = [values[idx] for idx in keep]
        except IndexError:
            raise ValueError(
                "line {0}: expected at least {1} values, found {2}".format(
                    line_no, keep[-1] + 1, len(values)))
        sink.write("{0}\n".format(",".join(chosen).strip()))


def main(argv=None):
    """Run the command-line tool.

    With ``--list`` the attributes of the input file are printed and no
    output file is opened. Otherwise the attributes selected via
    ``--features`` are copied, header and data, to the output file. A
    ``--relation`` given on the command line overrides the relation name
    found in the input.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use the
            process arguments.
    """
    # argument handling
    parser = argparse.ArgumentParser(description='Extract distinct features from an Arff-formatted file')
    parser.add_argument('--list', help='List all attributes of the provided file', action='store_true')
    parser.add_argument('--input', '-i', help='Input feature file', required=True, type=str)
    parser.add_argument('--output', '-o', help='Output feature file', type=str)
    parser.add_argument('--features', '-f', help='comma separated list of attribute indexes (e.g. 1:3,5,10)', type=str)
    parser.add_argument('--relation', '-r', help='Relation Name', required=False, type=str)
    args = parser.parse_args(argv)

    # Listing needs neither an output file nor a selection; extraction
    # still requires both.
    if not args.list and (args.output is None or args.features is None):
        parser.error("--output and --features are required unless --list is given")

    with _open_text(args.input, 'r') as source:
        # One shared iterator keeps line numbers continuous across the
        # header and data phases.
        numbered = enumerate(source, 1)
        file_relation, attrs, found_data = _read_header(numbered)

        if args.list:
            if found_data:
                for idx, (name, kind) in enumerate(attrs, 1):
                    print("[{0}] {1} ({2})".format(idx, name, kind))
            return

        with _open_text(args.output, 'w') as sink:
            if not found_data:
                # no @data tag: nothing to extract, the output stays empty
                return

            relation_name = args.relation if args.relation is not None else file_relation
            mask = createFeatureMask(args.features, attrs)

            # write the filtered header and display the kept attributes
            sink.write("@Relation {0}\n".format(relation_name))
            for idx, attr in enumerate(attrs):
                if mask[idx]:
                    sink.write("@attribute {0} {1}\n".format(attr[0], attr[1]))
                    print("{0} {1}".format(idx + 1, attr))
            sink.write("@Data\n")

            _write_selected_rows(numbered, sink, mask)


if __name__ == '__main__':
    main()