#!/bin/bash
# somlib_parser_script
# Calls the modules of the SOMLib java parser to create vector representations.
#
# usage:
#   somlib_parser_script Name InputDir MinWordLength Min_df Max_df Verbosity
#   somlib_parser_script Name InputDir        (uses the defaults below)
#
# reads:
#   directory with input files (text collection)
#
# writes (relative to the current directory):
#   vectors/   somlib input vector file, removed-terms list
#   parser/    parser-internal files (hash-tables, histograms)
#   Name.parser.log / Name.parser.html   log and summary html file
#   somlib     symbolic link to the SOMLib java class directory
#
# exit status:
#   0 on success, 1 on a usage error or when any processing step fails
#
# 2000-07-21 [AR]

# No 'set -e': every step whose failure matters is checked explicitly.
set -u -o pipefail

# global parameters (defaults; overridden from the command line in main)
PROG_NAME="${0##*/}"
NAME="somlib_test"
INPUT="inp_dir"
MINWORDLENGTH=3
MIN_DF=0.01
MAX_DF=0.6
VERBOSITY=2
HTMLFILE="$NAME.parser.html"
LOGFILE="$NAME.parser.log"

readonly SOMLIB_JAVA_DIR="/usr/local/somlib/bin/somlib_java"
readonly JAVA_HEAP_OPT="-Xmx10000m"
readonly RULE="------------------------------------------------------"

#######################################
# Print one line on stdout and append the same line to the log file.
# Globals:   LOGFILE (read)
# Arguments: the words of the message
#######################################
log() {
  printf '%s\n' "$*" | tee -a -- "$LOGFILE"
}

#######################################
# Report a fatal error on stderr and in the log file, then exit 1.
# Globals:   PROG_NAME, LOGFILE (read)
# Arguments: the words of the message
#######################################
die() {
  log "$PROG_NAME: ERROR: $*" >&2
  exit 1
}

#######################################
# Print the usage text on stderr and exit 1.
# Globals:   PROG_NAME (read)
#######################################
usage() {
  {
    echo "ERROR: Usage: $PROG_NAME Name InputDir MinWordLength Min_df Max_df Verbosity"
    echo " Example: $PROG_NAME somlib_test inp_dir 3 0.01 0.6 2"
    echo "or simple version:"
    echo "ERROR: Usage: $PROG_NAME Name InputDir"
    echo " Example: $PROG_NAME somlib_test inp_dir"
  } >&2
  exit 1
}

#######################################
# Create a working directory (no error if it already exists).
# Arguments: $1 - directory, $2 - purpose text for the log message
#######################################
make_dir() {
  local dir=$1 purpose=$2

  mkdir -p -- "$dir" 2>>"$LOGFILE" || die "cannot create directory $dir/"
  log "$PROG_NAME: created directory $dir/ for $purpose"
}

#######################################
# Make the SOMLib classes reachable as ./somlib so that
# "java somlib.<package>.<class>" resolves from the current directory.
# Globals:   SOMLIB_JAVA_DIR (read)
#######################################
link_somlib() {
  # A plain "ln -s" on a rerun would follow the existing link and try to
  # create a nested link inside the target directory, so skip it then.
  if [[ -e somlib || -L somlib ]]; then
    log "$PROG_NAME: local symbolic link somlib already exists, keeping it"
    return 0
  fi
  ln -s "$SOMLIB_JAVA_DIR" somlib 2>>"$LOGFILE" \
    || die "cannot create symbolic link to $SOMLIB_JAVA_DIR/"
  log "$PROG_NAME: created local symbolic link to $SOMLIB_JAVA_DIR/ directory"
}

#######################################
# Log and run one SOMLib java module; abort the script if it fails.
# The logged command line is built from the very argv that is executed.
# Arguments: $1 - purpose ("create wordhistograms", ...)
#            $2 - fully qualified java class
#            $3... - arguments passed to the class
# Outputs:   module stdout and stderr are appended to the log file
#######################################
run_java() {
  local purpose=$1 class=$2
  shift 2
  local -a cmd=(java "$JAVA_HEAP_OPT" "$class" "$@")
  local status=0

  log "$PROG_NAME: calling"
  log " ${cmd[*]}"
  log "to $purpose"
  "${cmd[@]}" >>"$LOGFILE" 2>&1 || status=$?
  # NOTE(review): assumes the SOMLib modules exit with 0 on success - confirm.
  if (( status != 0 )); then
    die "$class failed with exit status $status (see $LOGFILE)"
  fi
  log "$PROG_NAME: finished $class"
  log "$RULE"
}

#######################################
# Strip the "parser/histo/" prefix and the ".idv" suffix from the vector
# names in vectors/NAME.tfxidf. The file is only replaced after sed succeeded.
# Globals:   NAME (read)
#######################################
fix_vector_names() {
  local vec="vectors/$NAME.tfxidf"
  local tmp="vectors/.$NAME.tfxidf"

  log "$PROG_NAME: doing some postprocessing (sed on tfxidf-file to adapt vector names):"
  log "$PROG_NAME: sed -e 's/parser\/histo\///g' -e 's/\.idv//g' $vec > $tmp"
  [[ -f "$vec" ]] || die "vector file $vec was not created"
  # The dot is escaped so that only a literal ".idv" is removed.
  if ! sed -e 's/parser\/histo\///g' -e 's/\.idv//g' "$vec" >"$tmp"; then
    rm -f -- "$tmp"
    die "postprocessing of $vec failed"
  fi
  log "$PROG_NAME: removing old tfxidf file and renaming new tfxidf-file"
  mv -f -- "$tmp" "$vec" || die "cannot replace $vec"
}

#######################################
# Write the summary html page (overwrites the page of an earlier run).
# NOTE(review): the script arrived with the html tags stripped from its
# strings; only the visible text below is original. The markup is a minimal
# reconstruction - confirm the layout, the contents of the "Data" and
# "Parser" sections and the targets of the "Up" and mail links.
# Globals:   NAME, HTMLFILE (read)
#######################################
write_html() {
  log "$PROG_NAME: creating html file $HTMLFILE"
  cat >"$HTMLFILE" <<EOF || die "cannot write $HTMLFILE"
<html>
<head>
<title>SOMLib: Experiments: $NAME</title>
</head>
<body>
<hr>
Department of Software Technology<br>
Vienna University of Technology
<hr>
<h1>SOMLib: Experiments: $NAME</h1>
<hr>
<h2>Data</h2>
<h2>Parser</h2>
<hr>
Up<br>
<hr>
Comments: rauber@ifs.tuwien.ac.at
</body>
</html>
EOF
}

#######################################
# List the working directory and the vector files on stdout and in the log.
#######################################
list_results() {
  log "ls -al"
  ls -al | tee -a -- "$LOGFILE"
  log "ls -al vectors/*"
  ls -al vectors/* | tee -a -- "$LOGFILE"
}

#######################################
# Entry point: parse the command line and run the parser pipeline.
# Arguments: Name InputDir [MinWordLength Min_df Max_df Verbosity]
#######################################
main() {
  # check attribute count
  if (( $# != 2 && $# != 6 )); then
    usage
  fi
  NAME=$1
  INPUT=$2
  if (( $# == 6 )); then
    MINWORDLENGTH=$3
    MIN_DF=$4
    MAX_DF=$5
    VERBOSITY=$6
  fi
  [[ -n "$NAME" ]] || usage
  HTMLFILE="$NAME.parser.html"
  LOGFILE="$NAME.parser.log"

  # Start with a fresh log. "$NAME.log" is what the script always removed;
  # the log actually written is $LOGFILE, so that one is cleared as well.
  rm -rf -- "$NAME.log"
  rm -f -- "$LOGFILE"

  log "$NAME"
  log "$(date)"
  log "$PROG_NAME $NAME $INPUT $MINWORDLENGTH $MIN_DF $MAX_DF $VERBOSITY"
  log "(Usage: $PROG_NAME Name InputFiles MinWordLength Min_df Max_df Verbosity)"
  log "$RULE"

  [[ -e "$INPUT" ]] || die "input $INPUT does not exist"

  make_dir parser "parsing files"
  make_dir parser/histo "histogram files"
  make_dir vectors "vector files"
  link_somlib
  log "$RULE"

  run_java "create wordhistograms" \
    somlib.textrepresentation.wordsexc \
    -i "$INPUT" -o parser/histo -m "$MINWORDLENGTH" -v "$VERBOSITY"

  run_java "extract template vector" \
    somlib.textrepresentation.templatevectorexc \
    -i parser/histo -o "parser/$NAME.tv.hash" -v "$VERBOSITY"

  run_java "create reduced templatevector" \
    somlib.textrepresentation.reducerexc \
    -i "parser/$NAME.tv.hash" -o "parser/$NAME.tv.red.hash" \
    -n "$MIN_DF" -x "$MAX_DF" -r "vectors/$NAME.removed.txt" -v "$VERBOSITY"

  run_java "create individual vectors" \
    somlib.textrepresentation.extractorexc \
    -i parser/histo -j "parser/$NAME.tv.red.hash" -o "vectors/$NAME" \
    -t f -b f -f t -v "$VERBOSITY"

  fix_vector_names

  # Disabled in the original script (kept for reference):
  #   run_java "create normalized individual vectors" \
  #     somlib.som.preprocess.Vec2Vec \
  #     -i "vectors/$NAME.tfxidf" -o "vectors/$NAME.tfxidf.norm" \
  #     -n t -v "$VERBOSITY"
  #   gzip "vectors/$NAME.tfxidf" 2>>"$LOGFILE" &

  write_html

  log "$RULE"
  list_results
  log "$RULE"
  log "content parser done"
  log "$(date)"
  log "$RULE"
}

main "$@"