pax_global_header00006660000000000000000000000064132604372710014517gustar00rootroot0000000000000052 comment=ad986f7809107440cf4f4df412bf24e01e310c8f herisvm-0.8.2/000077500000000000000000000000001326043727100132035ustar00rootroot00000000000000herisvm-0.8.2/Makefile000066400000000000000000000002341326043727100146420ustar00rootroot00000000000000PROJECTNAME = herisvm SUBPRJ = doc scripts:tests MKC_REQD = 0.28.0 NODEPS = *:test-tests test : all-tests test-tests @: .include herisvm-0.8.2/README000066400000000000000000000006051326043727100140640ustar00rootroot00000000000000herisvm project is a collection of simple tools implementing evaluation algorithms for classification (machine learning). In particular heri-eval implements N-fold cross-validation where training and testing is run in parallel. This may be useful if you use multi-CPU computer. Run heri-eval -h, heri-stat -h and heri-split -h for documentation and examples. Also see doc/ subdirectory. herisvm-0.8.2/doc/000077500000000000000000000000001326043727100137505ustar00rootroot00000000000000herisvm-0.8.2/doc/INSTALL000066400000000000000000000007641326043727100150100ustar00rootroot00000000000000Build time dependencies: -- mk-configure (https://github.com/cheusov/mk-configure) is needed for building and installing the project -- pod2man script Runtime dependencies: -- bash -- ruby>=1.9.3 -- modern awk (gnu awk and nawk are good enough) Examples of how to build # cd herisvm-x.y.z # mkcmake all # mkcmake install or $ cd herisvm-x.y.z $ export PREFIX=/usr MANDIR=/usr/share/man SYSCONFDIR=/etc $ mkcmake all $ mkcmake install DESTDIR=/tmp/destdir herisvm-0.8.2/doc/LICENSE000066400000000000000000000021551326043727100147600ustar00rootroot00000000000000Copyright (c) 2015 Alexandra Figlovskaya Copyright (c) 2015 Aleksey Cheusov Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. herisvm-0.8.2/doc/Makefile000066400000000000000000000001701326043727100154060ustar00rootroot00000000000000FILES = LICENSE NEWS ../README TODO FILESDIR = ${DOCDIR} DOCDIR ?= ${DATADIR}/doc/herisvm .include herisvm-0.8.2/doc/NEWS000066400000000000000000000010111326043727100144400ustar00rootroot00000000000000====================================================================== Version 0.8.2, Mon, 2 Apr 2018 17:29:07 +0300 heri-stat-addons: yet another fix for format string for max_dev ====================================================================== Version 0.8.1, Thu, 15 Mar 2018 15:28:46 +0300 heri-stat-addons: format string for max_dev was fixed ====================================================================== Version 0.1.0, Sat, 13 Jun 2015 12:53:02 +0300 initial publicly available release herisvm-0.8.2/doc/TODO000066400000000000000000000003321326043727100144360ustar00rootroot00000000000000* heri-eval: - heri-eval -T: target class - Repeated random sub-sampling heri-eval -t 10 -r 60 ... - Alternative formats (crfsuite) for heri-split - Support for IE (no classes, just information extraction) herisvm-0.8.2/scripts/000077500000000000000000000000001326043727100146725ustar00rootroot00000000000000herisvm-0.8.2/scripts/Makefile000066400000000000000000000002211326043727100163250ustar00rootroot00000000000000SCRIPTS = heri-eval heri-split heri-stat heri-stat-addons MAN = heri-eval.1 heri-split.1 heri-stat.1 CLEANFILES = ${MAN} .include herisvm-0.8.2/scripts/heri-eval000077500000000000000000000227211326043727100165000ustar00rootroot00000000000000#!/usr/bin/env bash # Copyright (c) 2015 Alexandra Figlovskaya # Copyright (c) 2015-2017 Aleksey Cheusov # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including # without limitation the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject to # the following conditions: # # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # variables settable by user : ${SVM_TRAIN_CMD:=svm-train} : ${SVM_PREDICT_CMD:=svm-predict} : ${SVM_HERI_STAT_CMD:=heri-stat} : ${SVM_HERI_STAT_ADDONS_CMD:=heri-stat-addons} : ${SVM_HERI_SPLIT_CMD:=heri-split} : ${TMPDIR:=/tmp} ############################################################ set -e export LC_ALL=C indent2 (){ sed '/./ s/^/ /' "$@" } sig_handler (){ on_exit trap - "$1" kill -"$1" $$ } on_exit(){ show_stderr if test -z "$keep_tmp"; then if test -n "$tmp_dir"; then rm -rf "$tmp_dir" fi else echo "Temporary files are here $tmp_dir" 1>&2 fi } calculate_feature_count (){ awk '{ for (i=2; i <= NF; ++i) { if ($i + 0 > m) m = $i + 0 } } END { print m+1 }' "$@" } calculate_feature_count (){ awk '{ for (i=2; i <= NF; ++i) { if ($i + 0 > m) m = $i + 0 } } END { print m+1 }' "$@" } results_from_testing_sets (){ if ! test -s "$tmp_dir/testing_fold.txt"; then cat "$tmp_dir/result_single1.txt" return fi awk ' FNR == NR { # reading testing_fold.txt ++obj_num[$1] testobj[$1,obj_num[$1]] = NR next } # reading results on testing folds FNR == 1 { ++fold_num } { idx = testobj[fold_num, FNR] result [idx] = $0 } END { if ((NR % 2) != 0){ print "internal error!" > "/dev/stderr" exit 12 } count = NR/2 for (i=1; i <= count; ++i){ print result [i] } }' "$tmp_dir/testing_fold.txt" $result_all } show_stderr (){ if test -z "$last"; then return fi for i in `seq $last`; do # fn="$tmp_dir/train_stderr${i}" if test -s "$fn"; then echo "---- train stderr $i ----" 1>&2 cat -- "$fn" 1>&2 fi # fn="$tmp_dir/predict_stderr${i}" if test -s "$fn"; then echo "---- predict stderr $i ----" 1>&2 cat -- "$fn" 1>&2 fi done } wait_all (){ local i local ex ex=0 for i in `seq $last`; do if wait ${pid[$i]}; then : else ex=$? fi done return "$ex" } # heri-eval -t10 -n 5 dataset.libsvm # 10*5-fold cross-validation usage(){ cat 1>&2 <<'EOF' usage: heri-eval [OPTIONS] training_set [-- SVM_TRAIN_OPTIONS] Examples: heri-eval -n5 dataset.libsvm # 5-fold cross-validation heri-eval -e testing.libsvm training.libsvm # testing on testing.libsvm OPTIONS: -h Help message -n N The number of folds for T*N-fold cross-validation -t T The number of runs for T*N-fold cross-validation -e testing_set Testing set for hold-out -o Save results from testing sets to the specified file (golden_tag result_tag [score]) -O Save incorrectly classified objects to the specified file (#object_number: golden_tag result_tag [score]) -m Save confusion matrix to the specified file (frequency : golden_tag result_tag) -f Enable output of per-fold statistics (see -Mf) -M Output mode: t -- output total statistics, f -- output per-fold statistics, c -- output cross-fold statistics. -s Options passed to heri-split(1) -p Options passed to heri-stat(1) -S Seed value passed to heri-split(1). If it is not specified, the dataset is splitted into training and testing datasets randomly. -K Keep temporary directory after exiting -D Debugging mode, implies -K SVM_TRAIN_OPTIONS: options passed to svm-train(1) and alike Environment variables: SVM_TRAIN_CMD -- training utility, e.g., liblinear-train (the default is svm-train) SVM_PREDICT_CMD -- predicting utility, e.g., liblinear-predict (the default is svm-predict) TMPDIR -- temporary directory (the default is /tmp) Examples: Ex1: heri-eval -e testing_set.libsvm training_set.libsvm -- -s 0 -t 0 Ex1: export SVM_TRAIN_CMD='liblinear-train' export SVM_PREDICT_CMD='liblinear-predict' heri-eval -p '-mr' -v 5 training_set.libsvm -- -s 4 -q EOF } runs=1 output_mode=tc times=1 while getopts De:fhKm:M:n:o:O:p:s:S:t: f; do case "$f" in '?') usage exit 1;; h) usage exit 0;; n) number_of_folds="$OPTARG";; e) testing_set="$OPTARG";; t) times="$OPTARG";; m) confusion_matrix="$OPTARG";; o) results="$OPTARG";; O) incorrect_results="$OPTARG";; s) herisplit_args="$herisplit_args $OPTARG";; p) heristat_args="$heristat_args $OPTARG";; f) output_mode="f$output_mode";; M) output_mode="$OPTARG";; S) seed="$OPTARG";; K) keep_tmp=1;; D) keep_tmp=1 debug=1;; esac done shift `expr $OPTIND - 1` while test "$#" -gt 0; do case "$1" in --) shift break;; *) print_sh=`printf '%q' "$1"` files="$files $print_sh" shift;; esac done trap "sig_handler INT" INT trap "on_exit" 0 if test -z "$number_of_folds" -a -z "$testing_set"; then echo 'Either -v or -e must be specified, run heri-eval -h for details' 1>&2 exit 1 fi if test -z "$files"; then echo 'Training set is mandatory, run heri-eval -h for details' 1>&2 exit 1 fi tmp_dir=`mktemp -d $TMPDIR/svm.XXXXXX` training_testing (){ if test -n "$number_of_folds"; then ${SVM_HERI_SPLIT_CMD} $herisplit_args -c "$number_of_folds" -d "$tmp_dir" -s "$seed" $files if test -n "$seed"; then seed="${seed}9876" fi last="$number_of_folds" else eval "cat -- $files" > "$tmp_dir/train1.txt" cp "$testing_set" "$tmp_dir/test1.txt" last=1 fi for i in `seq $last`; do ${SVM_TRAIN_CMD} "$@" "$tmp_dir/train$i.txt" "$tmp_dir/svm$i.bin" \ 2> "$tmp_dir/train_stderr${i}" \ > "$tmp_dir/train_stdout${i}" & pid[$i]=$! done wait_all for i in `seq $last`; do ${SVM_PREDICT_CMD} "$tmp_dir/test$i.txt" "$tmp_dir/svm$i.bin" \ "$tmp_dir/result${i}.txt" \ 2> "$tmp_dir/predict_stderr${i}" \ > "$tmp_dir/predict_stdout${i}" & pid[$i]=$! done wait_all rm -f "$tmp_dir/golden_tags" "$tmp_dir/result.txt" } show_stat (){ for t in `seq $times`; do result_all='' for i in `seq $last`; do awk '{print $1}' "$tmp_dir/test${t}_$i.txt" > "$tmp_dir/golden_tags${t}_${i}" if [[ "_$output_mode" =~ f ]]; then echo "Fold ${t}x$i statistics" ${SVM_HERI_STAT_CMD} $heristat_args \ "$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" | indent2 echo '' fi ${SVM_HERI_STAT_CMD} -R \ "$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" \ > "$tmp_dir/evaluation${t}_${i}.txt" paste "$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" | \ tr ' ' ' ' > "$tmp_dir/result_single${t}_${i}.txt" ln -f "$tmp_dir/result_single${t}_${i}.txt" "$tmp_dir/result_single${i}.txt" result_all="$result_all $tmp_dir/result_single${i}.txt" done done } export HERISVM_FC=`calculate_feature_count $files` for t in `seq $times`; do training_testing "$@" # ls -l "$tmp_dir/" for i in `seq $last`; do ln "$tmp_dir/test${i}.txt" "$tmp_dir/test${t}_$i.txt" ln "$tmp_dir/result${i}.txt" "$tmp_dir/result${t}_${i}.txt" done # rm "$tmp_dir/test${i}.txt" "$tmp_dir/result${i}.txt" done #echo before test #ls -l "$tmp_dir" show_stat #echo after test results_from_testing_sets > "$tmp_dir/result.txt" # -o if test -n "$results"; then cp "$tmp_dir/result.txt" "$results" fi # -O if test -n "$incorrect_results"; then awk '$1 != $2 {print "#" NR, $0}' "$tmp_dir/result.txt" \ > "$incorrect_results" fi # -m if test -n "$confusion_matrix"; then awk '$1 != $2' "$tmp_dir/result.txt" | sort | uniq -c | sort -rn | awk '{print $1, ":", $2, $3}' > "$confusion_matrix" fi # if [[ "_$output_mode" =~ t ]]; then echo 'Total statistics' ${SVM_HERI_STAT_CMD} -1 $heristat_args "$tmp_dir"/result_single*_*.txt | indent2 echo '' fi if test -n "$number_of_folds" && [[ "_$output_mode" =~ c ]]; then echo 'Total cross-folds statistics' ${SVM_HERI_STAT_ADDONS_CMD} "$tmp_dir"/evaluation*.txt | indent2 fi herisvm-0.8.2/scripts/heri-eval.pod000066400000000000000000000047101326043727100172540ustar00rootroot00000000000000=head1 NAME heri-eval - evaluate classification algorithm =head1 SYNOPSIS B [OPTIONS] I [-- SVM_TRAIN_OPTIONS] =head1 DESCRIPTION B runs training algorithm on I and then evaluate it using testing set, specified by option I<-e>. Alternatively, cross-validation is run, if option I<-n> was applied. If cross-validation is used, training and testing on different folds are run in parallel, thus utilizing available CPUs. =head1 OPTIONS =over 6 =item B<-h, --help> Display help information. =item B<-f> Enable output of per-fold statistics. See B<-M>I. =item B<-n> I Enable T*I-fold cross-validation mode and set the number of folds to I. =item B<-t> I Enable I*N-fold cross-validation mode and set the number of runs to I which 1 by default. =item B<-e> I Enable hold-out mode and set the testing dataset. =item B<-o> I Save results from testing sets to the specified file. Format: golden_class result_class [score] =item B<-O> I Save incorrectly classified objects to the specified file. Format: #object_number: golden_class result_class [score]) =item B<-m> I Save confusion matrix to the specified file. Format: frequency : golden_class result_class =item B<-p> I Pass the specified I to B. =item B<-s> I Pass the specified I to B. =item B<-M> I Sets the output mode where chars are: t -- output total statistics, f -- output per-fold statistics, c -- output cross-fold statistics. The default is "-M tc". =item B<-S> I Pass the specified I to B. =item B<-K> Keep temporary directory after exiting. =item B<-D> Turn on the debugging mode, implies -K. =back =head1 ENVIRONMENT =over 6 =item I Training utility, e.g., liblinear-train (the default is svm-train). =item I Predicting utility, e.g., liblinear-predict (the default is svm-predict). =item I Utility for calculating statistics (the default is B). =item I Utility for calculating additional statistics (the default is B). =item I Utility for splitting the dataset (the default is B). =item I Temporary directory (the default is /tmp). =back =head1 HOME L =head1 SEE ALSO L L herisvm-0.8.2/scripts/heri-split000077500000000000000000000107211326043727100167010ustar00rootroot00000000000000#!/usr/bin/env ruby # Copyright (c) 2015 Alexandra Figlovskaya # Copyright (c) 2015-2017 Aleksey Cheusov # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including # without limitation the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject to # the following conditions: # # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. require 'optparse' $options = {} $fold_cnt = nil $tmp_dir = nil $seed = Random.new_seed $stratified = true OptionParser.new do |opts| opts.banner = < [OPTIONS] I [I...] =head1 DESCRIPTION B splits the dataset into several training and testing sets as it is required for N-fold cross-validation. Dataset contains one object per line as in svmlight format. By default stratified sampling is used. That is, all folds contain the same number of objects for each label. =head1 OPTIONS =over 6 =item B<-h, --help> Display help information. =item B<-c, --folds> I Set the number of folds. This is a mandatory option. =item B<-d, --output-dir> I Set the output directory. This is a mandatory option. =item B<-r,--random> Use random sampling instead of stratified one. =item B<-s, --seed> I Set the seed value for pseudorandom generator. =back =head1 HOME L =head1 SEE ALSO L L herisvm-0.8.2/scripts/heri-stat000077500000000000000000000167211326043727100165270ustar00rootroot00000000000000#!/usr/bin/env ruby # Copyright (c) 2015 Alexandra Figlovskaya # Copyright (c) 2015 Aleksey Cheusov # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including # without limitation the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject to # the following conditions: # # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. require 'optparse' @options = {} @err = nil @unspecified_class="__strelka_i_raketa__" def print_pretty(class_name, p, p_comment, r, r_comment, f1, f1_comment) puts "%13s P, R, F1: %-6.4g %-13s, %-6.4g %-13s, %-6.4g" \ % [class_name, p, p_comment, r, r_comment, f1, f1_comment] end def print_accuracy_pretty(a, a_comment) puts "Accuracy : %-6.4g %-13s" % [a, a_comment] end def print_raw(class_name, p, p_comment, r, r_comment, f1, f1_comment) puts "#{class_name}\tprecision\t#{p}\t#{p_comment.strip}" puts "#{class_name}\trecall\t#{r}\t#{r_comment.strip}" puts "#{class_name}\tf1\t#{f1}\t#{f1_comment.strip}" end def print_accuracy_raw(a, a_comment) puts "\taccuracy\t#{a}\t#{a_comment.strip}" end def print_stat(class_name, p, p_comment, r, r_comment, f1, f1_comment) if @options[:raw] print_raw(class_name, p, p_comment, r, r_comment, f1, f1_comment) else print_pretty(class_name, p, p_comment, r, r_comment, f1, f1_comment) end end def print_accuracy(a, a_comment) if @options[:raw] print_accuracy_raw(a, a_comment) else print_accuracy_pretty(a, a_comment) end end def pretty_div(a, b) "%5s/%-5s" % [a, b] end def normalize_tag(tag) tag = tag.to_s.sub(/^[+]/, "") # +1 => 1 if tag =~ /^-?[0-9]+[.][0-9]+$/ tag = tag.sub(/[.]0+$/, "") # -1.0000 => -1 end return tag end def split_into_3(line, fn) line = line.gsub(/\s+/, " ").strip() ret = ["", "", Float::MAX] tokens = line.split(/ /) case tokens.size when 2 ret = [normalize_tag(tokens[0]), normalize_tag(tokens[1]), Float::MAX] when 3 ret = [normalize_tag(tokens[0]), normalize_tag(tokens[1]), tokens[2].to_f] else ret = [normalize_tag(tokens[0]), normalize_tag(tokens[1]), Float::MAX] line.sub!(/^fake ?/, "") STDERR.puts("Bad line '#{line}' in file '#{fn}'") @err = 1 end if ret [2] < @options[:treshold] ret [1] = @unspecified_class end return ret end OptionParser.new do |opts| opts.banner = < heri-stat -1 [OPTIONS] [files...] OPTIONS: EOF opts.on('-h', '--help','display this message and exit') do puts opts exit 0 end @options[:raw] = false opts.on('-R', '--raw','raw tab-separated output') do @options[:raw] = true end @options[:micro_avg] = false opts.on('-m', '--micro-avg','disable micro averaged P/R/F1 output') do @options[:micro_avg] = true end @options[:macro_avg] = false opts.on('-r', '--macro-avg','disable macro averaged P/R/F1 output') do @options[:macro_avg] = true end @options[:statistics] = false opts.on('-c', '--per-class','disable output of per-class statistics') do @options[:statistics] = true end @options[:accuracy] = false opts.on('-a', '--accuracy','disable output of accuracy') do @options[:accuracy] = true end @options[:single] = false opts.on('-1', '--single','obtain both golden and predicted classes from single source. If this option is specified, the first token on input represents the golden class and second one -- predicted class') do @options[:single] = true end @options[:unclassified] = false opts.on("-u", "--unclassified=UNCLASSIFIED", 'set the label for "unclassified" object') do |u| @options[:unclassified] = u.to_s end @options[:treshold] = Float::MIN opts.on("-t", "--treshold=TRESHOLD", 'Minimal treshold for score') do |u| @options[:treshold] = u.to_f @options[:unclassified] = @unspecified_class end opts.separator " " end.parse! if @options[:unclassified] @options[:accuracy]=true else @options[:micro_avg]=true end if @options[:single] golden_tags = [] result_tags = [] while line = gets do gt, rt, fake = split_into_3(line, "") golden_tags << gt result_tags << rt end else golden_tags = IO.read(ARGV[0]).split("\n") result_tags = IO.read(ARGV[1]).split("\n") if golden_tags.length != result_tags.length STDERR.puts("Golden data and predictions should contain the same amount of classes"); exit 1 end golden_tags.each_index do |i| fake1, golden_tags[i], fake = split_into_3("fake " + golden_tags[i], ARGV[0]) fake1, result_tags[i], fake = split_into_3("fake " + result_tags[i], ARGV[1]) end end exit 1 if @err tag2golden_cnt = Hash.new(0) tag2result_cnt = Hash.new(0) tag2TP_cnt = Hash.new(0) all_precision = 0 all_recall = 0 golden_tags.each_index do |i| gt = golden_tags[i] rt = result_tags[i] tag2golden_cnt[gt] += 1 if gt != @options[:unclassified] if rt != @options[:unclassified] tag2result_cnt[rt] += 1 tag2TP_cnt[rt] += (gt == rt ? 1 : 0) end # make sure hash cell exists tag2TP_cnt[gt] += 0 tag2result_cnt[gt] += 0 end all_tp = 0 all_f1 = 0 res_tag2TP_cnt = tag2TP_cnt.sort_by { |key, value| key } res_tag2TP_cnt.each do |t, tp| p = (tag2result_cnt[t] > 0.0 ? tp.to_f / tag2result_cnt[t] : 0.0) r = (tag2golden_cnt[t] > 0.0 ? tp.to_f / tag2golden_cnt[t] : 0.0) f1 = (p+r > 0.0 ? 2*p*r / (p+r) : 0.0) if !@options[:statistics] print_stat("Class %-6s" % [t], p, pretty_div(tp, tag2result_cnt[t]), r, pretty_div(tp, tag2golden_cnt[t]), f1, "") end all_precision += p all_recall += r all_tp += tp all_f1 += f1 end all_rt = 0 tag2result_cnt.each do |tag, rt| all_rt += rt end all_gt = 0 tag2golden_cnt.each do |tag, gt| all_gt += gt end if !@options[:accuracy] accuracy = all_tp.to_f / all_rt.to_f print_accuracy(accuracy, pretty_div(all_tp, all_rt)) end if !@options[:micro_avg] micro_avg_precision = all_tp.to_f / all_rt.to_f micro_avg_recall = all_tp.to_f / all_gt.to_f micro_avg_f1 = 2*micro_avg_precision*micro_avg_recall / (micro_avg_precision+micro_avg_recall) print_stat("Micro average", micro_avg_precision, pretty_div(all_tp, all_rt), micro_avg_recall, pretty_div(all_tp, all_gt), micro_avg_f1, "") end if !@options[:macro_avg] && tag2TP_cnt.size > 0 macro_avg_precision = all_precision / tag2TP_cnt.size macro_avg_recall = all_recall / tag2TP_cnt.size macro_avg_f1 = all_f1 / tag2TP_cnt.size print_stat("Macro average", macro_avg_precision, "", macro_avg_recall, "", macro_avg_f1, "") end herisvm-0.8.2/scripts/heri-stat-addons000077500000000000000000000065701326043727100177760ustar00rootroot00000000000000#!/usr/bin/env ruby # Copyright (c) 2015 Alexandra Figlovskaya # Copyright (c) 2015 Aleksey Cheusov # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including # without limitation the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject to # the following conditions: # # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # This is an internal herisvm script. It takes output "heri-stat -R" # on input and outputs maximum deviations lines = [] while line = gets do lines << line.split("\t") end module Enumerable def sum return self.inject(0){|accum, i| accum + i } end def mean return self.sum / self.length.to_f end def sample_variance m = self.mean sum = self.inject(0){|accum, i| accum + (i - m) ** 2 } return sum / (self.length - 1).to_f end def standard_deviation return Math.sqrt(self.sample_variance) end end def hash2hash2array return Hash.new do |h,k| h [k] = Hash.new do |h,k| h [k] = [] end end end def hash2hash return Hash.new do |h,k| h [k] = {} end end def print_value_pretty(t, f, value1, value2) if value2 puts "%-13s %-23s: %6s %6s" % [t, f, value1, value2] else puts "%-13s %-23s: %6s" % [t, f, value1] end end def print_value_raw(t, f, value1, value2) exit 5 # not implemented yet end def print_value(t, f, value1, value2) if false #@options[:raw] print_value_raw(t, f, value1, value2) else print_value_pretty(t, f, value1, value2) end end values = hash2hash2array lines.each do |tokens| values [tokens[1]][tokens[0]] << tokens[2].to_f end mi = hash2hash ma = hash2hash max_deviation = hash2hash std_deviation = hash2hash values.each do |key1, hash| hash.each do |key2, arr| mi [key1] [key2] = arr.min ma [key1] [key2] = arr.max max_deviation [key1][key2] = arr.max - arr.min std_deviation [key1][key2] = arr.standard_deviation end end FIELDS = {"precision" => "P", "recall" => "R", "f1" => "F1", "accuracy" => "A"} #FIELDS = {"f1" => "F1"} TYPES = {"" => 1, "Macro average" => 1} FIELDS.each do |f, f_to_print| pairs = [] max_deviation[f].each do |t, max_dev| pairs << [f, t] if max_dev && TYPES.include?(t) end max_deviation[f].each do |t, max_dev| pairs << [f, t] if max_dev && ! TYPES.include?(t) end pairs.each do |ft| max_dev = max_deviation [ft[0]][ft[1]] std_dev = std_deviation [ft[0]][ft[1]] max_dev = "%-5.3g%%" % [max_dev*100] std_dev = "%-5.3g" % [std_dev*100] print_value(ft[1], "max/std deviation(" + f_to_print + ")", max_dev, std_dev) end puts '' end herisvm-0.8.2/scripts/heri-stat.pod000066400000000000000000000031021326043727100172720ustar00rootroot00000000000000=head1 NAME heri-stat - calculates precision, recall, F1 and some other things =head1 SYNOPSIS B [OPTIONS] I I B -1 [OPTIONS] [I...] =head1 DESCRIPTION Unless option B<-1> was applied B reads golden classes from I (one class per line) and predicted classes from I (one class per line) and outputs precision, recall, F1 and some other statistics to stdout. It is allowed for I to contain two tokens per line, the first one is a class and the second one is a score, e.g. probability. If B<-1> was applied, two or three tokens per line are expected on input: golden class, predicted class, and optional score. =head1 OPTIONS =over 6 =item B<-h, --help> Display help information. =item B<-R, --raw> Raw tab-separated output. =item B<-m, --micro-avg> Disable micro averaged P/R/F1 output. =item B<-r, --macro-avg> Disable macro averaged P/R/F1 output. =item B<-c, --per-class> Disable output of per-class statistics. =item B<-a, --accuracy> Disable output of accuracy. =item B<-1, --single> 2 or 3 tokens per line are expected on input. =item B<-u, --unclassified> I