'''
    extract_features_from_arff.py
    
    Extracts a selected subset of attributes (features) from an
    ARFF-formatted feature file, such as those provided by
    http://www.ifs.tuwien.ac.at/mir/msd/


    Copyright (C) 2012  Alexander Schindler
        Institute of Software Technology and Interactive Systems
        Vienna University of Technology
        Information and Software Engineering Group

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

    Version 1.0


'''

import gzip
import argparse


def usage():
    '''
    Print a one-line summary of the command-line options.

    The text mirrors the options registered with argparse in the script's
    entry point (--list, --input/-i, --output/-o, --features/-f,
    --relation/-r).
    '''
    # single-argument, parenthesised print behaves identically under the
    # Python 2 print statement and the Python 3 print function
    print("\n    extract_features_from_arff.py [--list] -i INPUT_FILE -o OUTPUT_FILE -f FEATURES [-r RELATION_NAME]")


def createFeatureMask(selected_features_str, attrs):
    '''
    Build a boolean selection mask over the attributes of an ARFF header.

    selected_features_str -- comma separated list of 1-based attribute
                             indexes; each component is either a single
                             index ("5") or an inclusive interval ("1:3").
                             Surrounding whitespace and empty components
                             (e.g. a trailing comma) are ignored.
    attrs                 -- sequence of attribute declarations; only its
                             length is used here.

    Returns a list of len(attrs) booleans where position i is True when
    attribute i + 1 was selected.

    Raises ValueError for non-numeric components or an interval whose start
    lies behind its end, and IndexError for indexes outside 1..len(attrs).
    '''
    attr_count = len(attrs)
    mask = [False] * attr_count

    for comp in selected_features_str.split(","):

        comp = comp.strip()

        if not comp:
            continue

        if ":" in comp:
            # interval definition: first and last field are the bounds
            bounds = comp.split(":")
            first, last = int(bounds[0]), int(bounds[-1])
        else:
            # single value definition
            first = last = int(comp)

        if first > last:
            raise ValueError(
                "invalid feature interval '{0}': start is greater than end".format(comp))

        # Without this check an index of 0 (or a negative one) would wrap
        # around through Python's negative indexing and silently select
        # attributes counted from the end of the list.
        if first < 1 or last > attr_count:
            raise IndexError(
                "feature selection '{0}' is outside the valid range 1..{1}".format(comp, attr_count))

        mask[first - 1:last] = [True] * (last - first + 1)

    return mask

def _open_maybe_gzip(path, mode):
    '''
    Open path through gzip when its last extension is "gz", otherwise as a
    plain file, using the given mode.
    '''
    if path.split(".")[-1] == "gz":
        return gzip.open(path, mode)
    return open(path, mode)


def main(argv=None):
    '''
    Command-line entry point: copy selected attributes of an ARFF file.

    argv -- list of command-line arguments; None makes argparse read
            sys.argv[1:].

    With --list the attributes declared in the header of the input file are
    printed as "[index] name (type)" and nothing is written; --output and
    --features are not needed in that mode. Otherwise the header and every
    data row are written to the output file, reduced to the attributes
    selected by --features. Input and output are read/written through gzip
    when the respective file name ends in "gz".

    Raises ValueError for an @attribute line without name and type, or for
    a data row that has fewer values than the highest selected index.
    '''
    # argument handling
    parser = argparse.ArgumentParser(description='Extract distinct features from an Arff-formatted file')
    parser.add_argument('--list', help='List all attributes of the provided file', action='store_true')
    parser.add_argument('--input', '-i',    help='Input feature file',  required=True, type=str)
    parser.add_argument('--output', '-o',   help='Output feature file (required unless --list is given)', required=False, type=str)
    parser.add_argument('--features', '-f', help='comma separated list of attribute indexes (e.g. 1:3,5,10; required unless --list is given)', required=False, type=str)
    parser.add_argument('--relation', '-r', help='Relation Name', required=False, type=str)

    args = vars(parser.parse_args(argv))

    list_only             = args['list']
    input_file_path       = args['input']
    output_file_path      = args['output']
    selected_features_str = args['features']
    relation_name         = args['relation']

    # Listing is how a user finds out which indexes exist, so it must not
    # depend on a feature selection or an output file.
    if not list_only and (output_file_path is None or selected_features_str is None):
        parser.error("--output and --features are required unless --list is given")

    header   = True
    attrs    = []   # [name, type] pairs in declaration order
    selected = []   # 0-based indexes of the attributes to keep

    input_file = _open_maybe_gzip(input_file_path, 'r')
    output_file = None

    try:
        # the output file is only touched when something will be written
        if not list_only:
            output_file = _open_maybe_gzip(output_file_path, 'w')

        for line_no, line in enumerate(input_file, 1):

            line = line.strip()

            # progress indicator
            if line_no % 1000 == 0:
                print(line_no)

            if header:

                keyword = line.lower()

                if keyword == "@data":

                    # data tag reached => end of header
                    if list_only:
                        break

                    header = False

                    mask = createFeatureMask(selected_features_str, attrs)
                    selected = [i for i, keep in enumerate(mask) if keep]

                    # write the filtered header and echo the kept attributes
                    output_file.write("@Relation {0}\n".format(relation_name))

                    for i in selected:
                        output_file.write("@attribute {0} {1}\n".format(attrs[i][0], attrs[i][1]))
                        print("{0} {1}".format(i + 1, attrs[i]))

                    output_file.write("@Data\n")

                elif keyword.startswith("@relation"):

                    # a name given with --relation takes precedence
                    parts = line.split(None, 1)
                    if relation_name is None and len(parts) > 1:
                        relation_name = parts[1].strip()

                elif keyword.startswith("@attribute"):

                    # Split on any whitespace and at most twice, so tabs or
                    # repeated blanks are accepted and a type containing
                    # spaces (e.g. "{rock, pop}") stays in one piece.
                    # NOTE: quoted attribute names containing spaces are
                    # still not supported.
                    parts = line.split(None, 2)
                    if len(parts) < 3:
                        raise ValueError(
                            "malformed @attribute declaration on line {0}: {1}".format(line_no, line))
                    attrs.append([parts[1], parts[2]])

                # everything else in the header (comments, blank lines) is dropped
                continue

            # blank lines and % comments carry no feature vector
            if not line or line.startswith("%"):
                continue

            values = line.split(",")

            if selected and selected[-1] >= len(values):
                raise ValueError(
                    "line {0} has {1} values, but attribute {2} was selected".format(
                        line_no, len(values), selected[-1] + 1))

            output_file.write("{0}\n".format(",".join([values[i] for i in selected]).strip()))

    finally:
        input_file.close()
        if output_file is not None:
            output_file.close()

    if list_only:
        for i, attr in enumerate(attrs):
            print("[{0}] {1} ({2})".format(i + 1, attr[0], attr[1]))


if __name__ == '__main__':
    main()