'''
    create_partitions.py
    
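    Creates train/test partitions for classification tasks based on
    label assignments provided by http://www.ifs.tuwien.ac.at/mir/msd/

    NOTE: in its current state the script does not write train/test
    splits; it merges the multi-label assignments with an ARFF feature
    file into a single multi-label ARFF file.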


    Copyright (C) 2012  Alexander Schindler
        Institute of Software Technology and Interactive Systems
        Vienna University of Technology
        Information and Software Engineering Group

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

    Version 1.0
'''

import os
import gzip
import argparse
import operator


# Defaults reproduce the values that used to be hard-coded in the script body,
# so running the script without arguments behaves as before.
DEFAULT_LABEL_FILE   = "D:/MIR/Data/MSD/benchmark/MASD_Multilabel.txt"
DEFAULT_FEATURE_FILE = "D:/MIR/Data/MSD/benchmark/features/EN0.arff.gz"
DEFAULT_OUTPUT_FILE  = "D:/test/EN0_ML.arff"
DEFAULT_RELATION     = "EN0"

# The script historically stopped after 12 written instances (this looks like
# a leftover debugging limit). It is kept as the default for compatibility;
# pass --max-rows 0 to convert the whole feature file.
DEFAULT_MAX_ROWS     = 12


def _to_text(line):
    """Return *line* as text.

    Lines read from a gzip stream are bytes on Python 3, while lines read
    from a plain text file are already str; on Python 2 both are str.
    """
    if isinstance(line, bytes) and not isinstance(line, str):
        return line.decode("utf-8")
    return line


def _open_feature_file(path):
    """Open the feature file for reading, transparently handling '.gz'."""
    if path.endswith(".gz"):
        return gzip.open(path, 'rb')
    return open(path, 'r')


def _load_label_assignments(lines):
    """Parse the multi-label assignment file.

    The first line is a header whose text after the last ':' is the number
    of labels. Every following line is 'track_id,label[,label...]'.

    Returns a tuple (num_labels, label_names, label_info):
        num_labels  -- label count announced in the header
        label_names -- label names ordered by their column position
                       (order of first appearance in the file)
        label_info  -- dict mapping track_id to a 0/1 list of length
                       num_labels

    Raises ValueError if the header count is not an integer or if the file
    contains more distinct labels than the header announces.
    """
    num_labels = 0
    label_pos  = {}
    label_info = {}
    first_line = True

    for line in lines:

        # the header line carries the number of labels
        if first_line:
            first_line = False
            num_labels = int(line.rstrip().split(":")[-1])
            continue

        fields   = [field.strip() for field in line.split(",")]
        track_id = fields[0]

        # skip blank lines instead of registering an empty track id
        if not track_id:
            continue

        info = [0] * num_labels

        for label in fields[1:]:

            # a trailing comma must not create a nameless label
            if not label:
                continue

            if label not in label_pos:
                if len(label_pos) >= num_labels:
                    raise ValueError(
                        "label file contains more than the {0} labels "
                        "announced in its header".format(num_labels))
                label_pos[label] = len(label_pos)

            info[label_pos[label]] = 1

        label_info[track_id] = info

    label_names = sorted(label_pos, key=label_pos.get)

    return num_labels, label_names, label_info


def _write_multilabel_arff(feature_lines, outfile, num_labels, label_names,
                           label_info, relation=DEFAULT_RELATION,
                           max_rows=DEFAULT_MAX_ROWS):
    """Write a multi-label ARFF file to *outfile*.

    The label attributes are written first, followed by the feature
    attributes copied from *feature_lines* (the attribute whose declaration
    mentions 'track_id' is dropped), the '@data' marker and one row per
    feature vector whose track id has a label assignment.

    *max_rows* limits the number of written rows; None means no limit.
    Returns the number of rows written.
    """
    outfile.write("@relation '{0}: -C {1}'\n".format(relation, num_labels))

    # NOTE(review): every row carries num_labels label columns, but only the
    # labels that actually occur in the label file are declared here --
    # confirm that the header count always matches the labels in use.
    for name in label_names:
        outfile.write("@attribute {0} {{0,1}}\n".format(name.replace(" ", "_")))

    in_header    = True
    rows_written = 0

    for raw_line in feature_lines:

        line = _to_text(raw_line)

        # copy header
        if in_header:

            # ARFF keywords are case-insensitive
            keyword = line.strip().lower()

            if keyword == "@data":
                in_header = False
                # without this marker the output has no data section
                outfile.write("@data\n")

            elif keyword.startswith("@attribute") and "track_id" not in keyword:
                outfile.write("{0}\n".format(line.rstrip()))

            continue

        # skip blank lines and comments inside the data section
        stripped = line.strip()
        if not stripped or stripped[0] == "%":
            continue

        # the track id is expected in the last column; if that column is too
        # short to be an id, fall back to the second to last column
        fields   = line.lstrip().split(",")
        track_id = fields[-1].strip()

        if len(track_id) < 5 and len(fields) > 1:
            track_id = fields[-2].strip()

        track_id = track_id.replace("'", "")

        if track_id not in label_info:
            continue

        label_str   = ", ".join(str(flag) for flag in label_info[track_id])

        # NOTE(review): removing the id and then cutting the final character
        # yields a clean feature list only when the id is the last column;
        # the fallback-column case is kept unchanged -- confirm intent.
        feature_str = "{0}".format(line.strip().replace("'", "").replace(track_id, "").strip()[0:-1])

        outfile.write("{0},{1}\n".format(label_str, feature_str))

        rows_written += 1

        if max_rows is not None and rows_written >= max_rows:
            break

    return rows_written


def main(argv=None):
    """Merge label assignments and features into one multi-label ARFF file.

    *argv* is the list of command line arguments (defaults to sys.argv[1:]).
    All options are optional and default to the previously hard-coded values.
    """
    parser = argparse.ArgumentParser(description='Merge multi-label assignments with an Arff-formatted feature file')
    parser.add_argument('--labelfile', '-l', help='File containing track-labels', default=DEFAULT_LABEL_FILE, type=str)
    parser.add_argument('--featfile',  '-f', help='Arff-formatted feature file (optionally gzipped)', default=DEFAULT_FEATURE_FILE, type=str)
    parser.add_argument('--outfile',   '-o', help='Multi-label Arff file to write', default=DEFAULT_OUTPUT_FILE, type=str)
    parser.add_argument('--relation',  '-r', help='Relation name of the output file', default=DEFAULT_RELATION, type=str)
    parser.add_argument('--max-rows',  '-n', help='Maximum number of instances to write (0 = no limit)', default=DEFAULT_MAX_ROWS, type=int)

    args = vars(parser.parse_args(argv))

    max_rows = args['max_rows'] if args['max_rows'] > 0 else None

    # load class label assignments
    with open(args['labelfile'], 'r') as label_file:
        num_labels, label_names, label_info = _load_label_assignments(label_file)

    # process feature file
    with _open_feature_file(args['featfile']) as feature_file, \
            open(args['outfile'], 'w') as outfile:
        _write_multilabel_arff(feature_file, outfile, num_labels, label_names,
                               label_info, relation=args['relation'],
                               max_rows=max_rows)


if __name__ == '__main__':
    main()