pax_global_header00006660000000000000000000000064125607550430014521gustar00rootroot0000000000000052 comment=165f966c4ac20319783c8402d4e341c137f43c16 herisvm-herisvm-0.7.0/000077500000000000000000000000001256075504300146555ustar00rootroot00000000000000herisvm-herisvm-0.7.0/Makefile000066400000000000000000000002341256075504300163140ustar00rootroot00000000000000PROJECTNAME = herisvm SUBPRJ = doc scripts:tests MKC_REQD = 0.28.0 NODEPS = *:test-tests test : all-tests test-tests @: .include herisvm-herisvm-0.7.0/README000066400000000000000000000006051256075504300155360ustar00rootroot00000000000000herisvm project is a collection of simple tools implementing evaluation algorithms for classification (machine learning). In particular heri-eval implements N-fold cross-validation where training and testing is run in parallel. This may be useful if you use multi-CPU computer. Run heri-eval -h, heri-stat -h and heri-split -h for documentation and examples. Also see doc/ subdirectory. herisvm-herisvm-0.7.0/doc/000077500000000000000000000000001256075504300154225ustar00rootroot00000000000000herisvm-herisvm-0.7.0/doc/INSTALL000066400000000000000000000007641256075504300164620ustar00rootroot00000000000000Build time dependencies: -- mk-configure (https://github.com/cheusov/mk-configure) is needed for building and installing the project -- pod2man script Runtime dependencies: -- bash -- ruby>=1.9.3 -- modern awk (gnu awk and nawk are good enough) Examples of how to build # cd herisvm-x.y.z # mkcmake all # mkcmake install or $ cd herisvm-x.y.z $ export PREFIX=/usr MANDIR=/usr/share/man SYSCONFDIR=/etc $ mkcmake all $ mkcmake install DESTDIR=/tmp/destdir herisvm-herisvm-0.7.0/doc/LICENSE000066400000000000000000000021551256075504300164320ustar00rootroot00000000000000Copyright (c) 2015 Alexandra Figlovskaya Copyright (c) 2015 Aleksey Cheusov Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. herisvm-herisvm-0.7.0/doc/Makefile000066400000000000000000000001701256075504300170600ustar00rootroot00000000000000FILES = LICENSE NEWS ../README TODO FILESDIR = ${DOCDIR} DOCDIR ?= ${DATADIR}/doc/herisvm .include herisvm-herisvm-0.7.0/doc/NEWS000066400000000000000000000002331256075504300161170ustar00rootroot00000000000000====================================================================== Version 0.1.0, Sat, 13 Jun 2015 12:53:02 +0300 initial publicly available release herisvm-herisvm-0.7.0/doc/TODO000066400000000000000000000001761256075504300161160ustar00rootroot00000000000000* heri-eval: - Repeated random sub-sampling heri-eval -t 10 -r 60 ... - Alternative formats (crfsuite) for heri-split herisvm-herisvm-0.7.0/scripts/000077500000000000000000000000001256075504300163445ustar00rootroot00000000000000herisvm-herisvm-0.7.0/scripts/Makefile000066400000000000000000000002211256075504300177770ustar00rootroot00000000000000SCRIPTS = heri-eval heri-split heri-stat heri-stat-addons MAN = heri-eval.1 heri-split.1 heri-stat.1 CLEANFILES = ${MAN} .include herisvm-herisvm-0.7.0/scripts/heri-eval000077500000000000000000000216621256075504300201550ustar00rootroot00000000000000#!/usr/bin/env bash # Copyright (c) 2015 Alexandra Figlovskaya # Copyright (c) 2015 Aleksey Cheusov # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including # without limitation the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject to # the following conditions: # # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # variables settable by user : ${SVM_TRAIN_CMD:=svm-train} : ${SVM_PREDICT_CMD:=svm-predict} : ${TMPDIR:=/tmp} ############################################################ set -e export LC_ALL=C indent2 (){ sed '/./ s/^/ /' "$@" } sig_handler (){ on_exit trap - "$1" kill -"$1" $$ } on_exit(){ show_stderr if test -z "$keep_tmp"; then if test -n "$tmp_dir"; then rm -rf "$tmp_dir" fi else echo "Temporary files are here $tmp_dir" 1>&2 fi } results_from_testing_sets (){ if ! test -s "$tmp_dir/testing_fold.txt"; then cat "$tmp_dir/result_single1.txt" return fi awk ' FNR == NR { # reading testing_fold.txt ++obj_num[$1] testobj[$1,obj_num[$1]] = NR next } # reading results on testing folds FNR == 1 { ++fold_num } { idx = testobj[fold_num, FNR] result [idx] = $0 } END { if ((NR % 2) != 0){ print "internal error!" > "/dev/stderr" exit 12 } count = NR/2 for (i=1; i <= count; ++i){ print result [i] } }' "$tmp_dir/testing_fold.txt" $result_all } show_stderr (){ if test -z "$last"; then return fi for i in `seq $last`; do # fn="$tmp_dir/train_stderr${i}" if test -s "$fn"; then echo "---- train stderr $i ----" 1>&2 cat -- "$fn" 1>&2 fi # fn="$tmp_dir/predict_stderr${i}" if test -s "$fn"; then echo "---- predict stderr $i ----" 1>&2 cat -- "$fn" 1>&2 fi done } wait_all (){ local i local ex ex=0 for i in `seq $last`; do if wait ${pid[$i]}; then : else ex=$? fi done return "$ex" } # heri-eval -t10 -n 5 dataset.libsvm # 10*5-fold cross-validation usage(){ cat 1>&2 <<'EOF' usage: heri-eval [OPTIONS] training_set [-- SVM_TRAIN_OPTIONS] Examples: heri-eval -n5 dataset.libsvm # 5-fold cross-validation heri-eval -e testing.libsvm dataset.libsvm # testing on testing.libsvm OPTIONS: -h help message -n N N-fold cross validation mode (mandatory option) -t T T*N-fold cross validation mode (1 by default) -e testing_set testing set for hold-out method -o save results from testing sets to the specified file (golden_tag result_tag [score]) -O save incorrectly classified objects to the specified file (#object_number: golden_tag result_tag [score]) -m save confusion matrix to the specified file (frequency : golden_tag result_tag) -f Enable output of per-fold statistics (see -Mf) -M output mode: t -- output total statistics, f -- output per-fold statistics, c -- output cross-fold statistics. The default is "-M tc". -p options passed to heri-stat(1) -S seed pseudo-random generator used for splitting dataset into traing and testing parts. The default is empty, which means 'split dataset randomly every invocation' -K keep temporary directory after exiting -D debugging mode, implies -K SVM_TRAIN_OPTIONS: options passed to svm-train(1) and alike Environment variables: SVM_TRAIN_CMD -- training utility, e.g., liblinear-train (the default is svm-train) SVM_PREDICT_CMD -- predicting utility, e.g., liblinear-predict (the default is svm-predict) TMPDIR -- temporary directory (the default is /tmp) Examples: Ex1: heri-eval -e testing_set.libsvm training_set.libsvm -- -s 0 -t 0 Ex1: export SVM_TRAIN_CMD='liblinear-train' export SVM_PREDICT_CMD='liblinear-predict' heri-eval -p '-mr' -v 5 training_set.libsvm -- -s 4 -q EOF } runs=1 output_mode=tc times=1 while getopts De:fhKm:M:n:o:O:p:S:t: f; do case "$f" in '?') usage exit 1;; h) usage exit 0;; n) number_of_folds="$OPTARG";; e) testing_set="$OPTARG";; t) times="$OPTARG";; m) confusion_matrix="$OPTARG";; o) results="$OPTARG";; O) incorrect_results="$OPTARG";; p) heristat_args="$heristat_args $OPTARG";; f) output_mode="f$output_mode";; M) output_mode="$OPTARG";; S) seed="$OPTARG";; K) keep_tmp=1;; D) keep_tmp=1 debug=1;; esac done shift `expr $OPTIND - 1` while test "$#" -gt 0; do case "$1" in --) shift break;; *) print_sh=`printf '%q' "$1"` files="$files $print_sh" shift;; esac done trap "sig_handler INT" INT trap "on_exit" 0 if test -z "$number_of_folds" -a -z "$testing_set"; then echo 'Either -v or -e must be specified, run heri-eval -h for details' 1>&2 exit 1 fi if test -z "$files"; then echo 'Training set is mandatory, run heri-eval -h for details' 1>&2 exit 1 fi tmp_dir=`mktemp -d $TMPDIR/svm.XXXXXX` training_testing (){ if test -n "$number_of_folds"; then heri-split -c "$number_of_folds" -d "$tmp_dir" -s "$seed" $files if test -n "$seed"; then seed="${seed}9876" fi last="$number_of_folds" else eval "cat -- $files" > "$tmp_dir/train1.txt" cp "$testing_set" "$tmp_dir/test1.txt" last=1 fi for i in `seq $last`; do ${SVM_TRAIN_CMD} "$@" "$tmp_dir/train$i.txt" "$tmp_dir/svm$i.bin" \ 2> "$tmp_dir/train_stderr${i}" \ > "$tmp_dir/train_stdout${i}" & pid[$i]=$! done wait_all for i in `seq $last`; do ${SVM_PREDICT_CMD} "$tmp_dir/test$i.txt" "$tmp_dir/svm$i.bin" \ "$tmp_dir/result${i}.txt" \ 2> "$tmp_dir/predict_stderr${i}" \ > "$tmp_dir/predict_stdout${i}" & pid[$i]=$! done wait_all rm -f "$tmp_dir/golden_tags" "$tmp_dir/result.txt" } show_stat (){ for t in `seq $times`; do result_all='' for i in `seq $last`; do awk '{print $1}' "$tmp_dir/test${t}_$i.txt" > "$tmp_dir/golden_tags${t}_${i}" if [[ "_$output_mode" =~ f ]]; then echo "Fold ${t}x$i statistics" heri-stat $heristat_args \ "$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" | indent2 echo '' fi heri-stat -R "$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" \ > "$tmp_dir/evaluation${t}_${i}.txt" paste "$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" | \ tr ' ' ' ' > "$tmp_dir/result_single${t}_${i}.txt" ln -f "$tmp_dir/result_single${t}_${i}.txt" "$tmp_dir/result_single${i}.txt" result_all="$result_all $tmp_dir/result_single${i}.txt" done done } for t in `seq $times`; do training_testing "$@" # ls -l "$tmp_dir/" for i in `seq $last`; do ln "$tmp_dir/test${i}.txt" "$tmp_dir/test${t}_$i.txt" ln "$tmp_dir/result${i}.txt" "$tmp_dir/result${t}_${i}.txt" done # rm "$tmp_dir/test${i}.txt" "$tmp_dir/result${i}.txt" done #echo before test #ls -l "$tmp_dir" show_stat #echo after test results_from_testing_sets > "$tmp_dir/result.txt" # -o if test -n "$results"; then cp "$tmp_dir/result.txt" "$results" fi # -O if test -n "$incorrect_results"; then awk '$1 != $2 {print "#" NR, $0}' "$tmp_dir/result.txt" \ > "$incorrect_results" fi # -m if test -n "$confusion_matrix"; then awk '$1 != $2' "$tmp_dir/result.txt" | sort | uniq -c | sort -rn | awk '{print $1, ":", $2, $3}' > "$confusion_matrix" fi # if [[ "_$output_mode" =~ t ]]; then echo 'Total statistics' heri-stat -1 $heristat_args "$tmp_dir"/result_single*_*.txt | indent2 echo '' fi if test -n "$number_of_folds" && [[ "_$output_mode" =~ c ]]; then echo 'Total cross-folds statistics' heri-stat-addons "$tmp_dir"/evaluation*.txt | indent2 fi herisvm-herisvm-0.7.0/scripts/heri-eval.pod000066400000000000000000000037351256075504300207340ustar00rootroot00000000000000=head1 NAME heri-eval - evaluate classification algorithm =head1 SYNOPSIS B [OPTIONS] I [-- SVM_TRAIN_OPTIONS] =head1 DESCRIPTION B runs training algorithm on I and then evaluate it using testing set, specified by option I<-e>. Alternatively, cross-validation is run, if option I<-n> was applied. If cross-validation is used, training and testing on different folds are run in parallel, thus utilizing available CPUs. =head1 OPTIONS =over 6 =item B<-h, --help> Display help information. =item B<-f> Enable output of per-fold statistics. See B<-M>I. =item B<-n> I I-fold cross validation mode (mandatory option). =item B<-t> I I*N-fold cross validation mode (1 by default). =item B<-e> I Sets the testing dataset. =item B<-o> I Save results from testing sets to the specified file. Format: golden_class result_class [score] =item B<-O> I Save incorrectly classified objects to the specified file. Format: #object_number: golden_class result_class [score]) =item B<-m> I Save confusion matrix to the specified file. Format: frequency : golden_class result_class =item B<-p> I Pass the specified I to B =item B<-M> I Sets the output mode where chars are: t -- output total statistics, f -- output per-fold statistics, c -- output cross-fold statistics. The default is "-M tc". =item B<-S> I Pass the specified I to B. =item B<-K> Keep temporary directory after exiting. =item B<-D> Turn on the debugging mode, implies -K. =back =head1 ENVIRONMENT =over 6 =item I Training utility, e.g., liblinear-train (the default is svm-train). =item I Predicting utility, e.g., liblinear-predict (the default is svm-predict). =item I Temporary directory (the default is /tmp). =back =head1 HOME L =head1 SEE ALSO L L herisvm-herisvm-0.7.0/scripts/heri-split000077500000000000000000000070061256075504300203550ustar00rootroot00000000000000#!/usr/bin/env ruby # Copyright (c) 2015 Alexandra Figlovskaya # Copyright (c) 2015 Aleksey Cheusov # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including # without limitation the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject to # the following conditions: # # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. require 'optparse' options = {} fold_cnt = nil tmp_dir = nil seed = Random.new_seed OptionParser.new do |opts| opts.banner = < [OPTIONS] I [I...] =head1 DESCRIPTION B splits the dataset into several training and testing sets as it is required for N-fold cross-validation. Dataset contains one object per line as in svmlight/libsvm formats. =head1 OPTIONS =over 6 =item B<-h, --help> Display help information. =item B<-c, --folds> I Sets a number if folds. This is a mandatory option. =item B<-d, --output-dir> I Sets the output directory. This is a mandatory option. =item B<-s, --seed> I Sets the seed value for pseudorandom generator. =back =head1 HOME L =head1 SEE ALSO L L herisvm-herisvm-0.7.0/scripts/heri-stat000077500000000000000000000166011256075504300201760ustar00rootroot00000000000000#!/usr/bin/env ruby # Copyright (c) 2015 Alexandra Figlovskaya # Copyright (c) 2015 Aleksey Cheusov # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including # without limitation the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject to # the following conditions: # # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. require 'optparse' @options = {} @err = nil @unspecified_class="__strelka_i_raketa__" def print_pretty(class_name, p, p_comment, r, r_comment, f1, f1_comment) puts "%13s P, R, F1: %-6.4g %-13s, %-6.4g %-13s, %-6.4g" \ % [class_name, p, p_comment, r, r_comment, f1, f1_comment] end def print_accuracy_pretty(a, a_comment) puts "Accuracy : %-6.4g %-13s" % [a, a_comment] end def print_raw(class_name, p, p_comment, r, r_comment, f1, f1_comment) puts "#{class_name}\tprecision\t#{p}\t#{p_comment.strip}" puts "#{class_name}\trecall\t#{r}\t#{r_comment.strip}" puts "#{class_name}\tf1\t#{f1}\t#{f1_comment.strip}" end def print_accuracy_raw(a, a_comment) puts "\taccuracy\t#{a}\t#{a_comment.strip}" end def print_stat(class_name, p, p_comment, r, r_comment, f1, f1_comment) if @options[:raw] print_raw(class_name, p, p_comment, r, r_comment, f1, f1_comment) else print_pretty(class_name, p, p_comment, r, r_comment, f1, f1_comment) end end def print_accuracy(a, a_comment) if @options[:raw] print_accuracy_raw(a, a_comment) else print_accuracy_pretty(a, a_comment) end end def pretty_div(a, b) "%5s/%-5s" % [a, b] end def normalize_tag(tag) tag = tag.sub(/^[+]/, "") # +1 => 1 if tag =~ /^-?[0-9]+[.][0-9]+$/ tag = tag.sub(/[.]0+$/, "") # -1.0000 => -1 end return tag end def split_into_3(line, fn) line = line.gsub(/\s+/, " ").strip() ret = ["", "", Float::MAX] tokens = line.split(/ /) case tokens.size when 2 ret = [normalize_tag(tokens[0]), normalize_tag(tokens[1]), Float::MAX] when 3 ret = [normalize_tag(tokens[0]), normalize_tag(tokens[1]), tokens[2].to_f] else line.sub!(/^fake ?/, "") STDERR.puts("Bad line '#{line}' in file '#{fn}'") @err = 1 end if ret [2] < @options[:treshold] ret [1] = @unspecified_class end return ret end OptionParser.new do |opts| opts.banner = < heri-stat -1 [OPTIONS] [files...] OPTIONS: EOF opts.on('-h', '--help','display this message and exit') do puts opts exit 0 end @options[:raw] = false opts.on('-R', '--raw','raw tab-separated output') do @options[:raw] = true end @options[:micro_avg] = false opts.on('-m', '--micro-avg','disable micro averaged P/R/F1 output') do @options[:micro_avg] = true end @options[:macro_avg] = false opts.on('-r', '--macro-avg','disable macro averaged P/R/F1 output') do @options[:macro_avg] = true end @options[:statistics] = false opts.on('-c', '--per-class','disable output of per-class statistics') do @options[:statistics] = true end @options[:accuracy] = false opts.on('-a', '--accuracy','disable output of accuracy') do @options[:accuracy] = true end @options[:single] = false opts.on('-1', '--single','obtain both golden and predicted classes from single source. If this option is specified, the first token on input represents the golden class and second one -- predicted class') do @options[:single] = true end @options[:unclassified] = false opts.on("-u", "--unclassified=UNCLASSIFIED", 'set the label for "unclassified" object') do |u| @options[:unclassified] = u.to_s end @options[:treshold] = Float::MIN opts.on("-t", "--treshold=TRESHOLD", 'Minimal treshold for score') do |u| @options[:treshold] = u.to_f @options[:unclassified] = @unspecified_class end opts.separator " " end.parse! if @options[:unclassified] @options[:accuracy]=true else @options[:micro_avg]=true end if @options[:single] golden_tags = [] result_tags = [] while line = gets do gt, rt, fake = split_into_3(line, "") golden_tags << gt result_tags << rt end else golden_tags = IO.read(ARGV[0]).split("\n") result_tags = IO.read(ARGV[1]).split("\n") if golden_tags.length != result_tags.length STDERR.puts("Golden data and predictions should contain the same amount of classes"); exit 1 end golden_tags.each_index do |i| fake1, golden_tags[i], fake = split_into_3("fake " + golden_tags[i], ARGV[0]) fake1, result_tags[i], fake = split_into_3("fake " + result_tags[i], ARGV[1]) end end exit 1 if @err tag2golden_cnt = Hash.new(0) tag2result_cnt = Hash.new(0) tag2TP_cnt = Hash.new(0) all_precision = 0 all_recall = 0 golden_tags.each_index do |i| gt = golden_tags[i] rt = result_tags[i] tag2golden_cnt[gt] += 1 if gt != @options[:unclassified] if rt != @options[:unclassified] tag2result_cnt[rt] += 1 tag2TP_cnt[rt] += (gt == rt ? 1 : 0) end # make sure hash cell exists tag2TP_cnt[gt] += 0 tag2result_cnt[gt] += 0 end all_tp = 0 all_f1 = 0 res_tag2TP_cnt = tag2TP_cnt.sort_by { |key, value| key } res_tag2TP_cnt.each do |t, tp| p = (tag2result_cnt[t] > 0.0 ? tp.to_f / tag2result_cnt[t] : 1.0) r = (tag2golden_cnt[t] > 0.0 ? tp.to_f / tag2golden_cnt[t] : 0.0) f1 = (p+r > 0.0 ? 2*p*r / (p+r) : 0.0) if !@options[:statistics] print_stat("Class %-6s" % [t], p, pretty_div(tp, tag2result_cnt[t]), r, pretty_div(tp, tag2golden_cnt[t]), f1, "") end all_precision += p all_recall += r all_tp += tp all_f1 += f1 end all_rt = 0 tag2result_cnt.each do |tag, rt| all_rt += rt end all_gt = 0 tag2golden_cnt.each do |tag, gt| all_gt += gt end if !@options[:accuracy] accuracy = all_tp.to_f / all_rt.to_f print_accuracy(accuracy, pretty_div(all_tp, all_rt)) end if !@options[:micro_avg] micro_avg_precision = all_tp.to_f / all_rt.to_f micro_avg_recall = all_tp.to_f / all_gt.to_f micro_avg_f1 = 2*micro_avg_precision*micro_avg_recall / (micro_avg_precision+micro_avg_recall) print_stat("Micro average", micro_avg_precision, pretty_div(all_tp, all_rt), micro_avg_recall, pretty_div(all_tp, all_gt), micro_avg_f1, "") end if !@options[:macro_avg] && tag2TP_cnt.size > 0 macro_avg_precision = all_precision / tag2TP_cnt.size macro_avg_recall = all_recall / tag2TP_cnt.size macro_avg_f1 = all_f1 / tag2TP_cnt.size print_stat("Macro average", macro_avg_precision, "", macro_avg_recall, "", macro_avg_f1, "") end herisvm-herisvm-0.7.0/scripts/heri-stat-addons000077500000000000000000000065671256075504300214560ustar00rootroot00000000000000#!/usr/bin/env ruby # Copyright (c) 2015 Alexandra Figlovskaya # Copyright (c) 2015 Aleksey Cheusov # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including # without limitation the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject to # the following conditions: # # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # This is an internal herisvm script. It takes output "heri-stat -R" # on input and outputs maximum deviations lines = [] while line = gets do lines << line.split("\t") end module Enumerable def sum return self.inject(0){|accum, i| accum + i } end def mean return self.sum / self.length.to_f end def sample_variance m = self.mean sum = self.inject(0){|accum, i| accum + (i - m) ** 2 } return sum / (self.length - 1).to_f end def standard_deviation return Math.sqrt(self.sample_variance) end end def hash2hash2array return Hash.new do |h,k| h [k] = Hash.new do |h,k| h [k] = [] end end end def hash2hash return Hash.new do |h,k| h [k] = {} end end def print_value_pretty(t, f, value1, value2) if value2 puts "%-13s %-23s: %6s %6s" % [t, f, value1, value2] else puts "%-13s %-23s: %6s" % [t, f, value1] end end def print_value_raw(t, f, value1, value2) exit 5 # not implemented yet end def print_value(t, f, value1, value2) if false #@options[:raw] print_value_raw(t, f, value1, value2) else print_value_pretty(t, f, value1, value2) end end values = hash2hash2array lines.each do |tokens| values [tokens[1]][tokens[0]] << tokens[2].to_f end mi = hash2hash ma = hash2hash max_deviation = hash2hash std_deviation = hash2hash values.each do |key1, hash| hash.each do |key2, arr| mi [key1] [key2] = arr.min ma [key1] [key2] = arr.max max_deviation [key1][key2] = arr.max - arr.min std_deviation [key1][key2] = arr.standard_deviation end end FIELDS = {"precision" => "P", "recall" => "R", "f1" => "F1", "accuracy" => "A"} #FIELDS = {"f1" => "F1"} TYPES = {"" => 1, "Macro average" => 1} FIELDS.each do |f, f_to_print| pairs = [] max_deviation[f].each do |t, max_dev| pairs << [f, t] if max_dev && TYPES.include?(t) end max_deviation[f].each do |t, max_dev| pairs << [f, t] if max_dev && ! TYPES.include?(t) end pairs.each do |ft| max_dev = max_deviation [ft[0]][ft[1]] std_dev = std_deviation [ft[0]][ft[1]] max_dev = "%-5.3g%" % [max_dev*100] std_dev = "%-5.3g" % [std_dev*100] print_value(ft[1], "max/std deviation(" + f_to_print + ")", max_dev, std_dev) end puts '' end herisvm-herisvm-0.7.0/scripts/heri-stat.pod000066400000000000000000000031501256075504300207470ustar00rootroot00000000000000=head1 NAME heri-stat - calculates precision, recall, F1 and some other things for given golden data and predictions. =head1 SYNOPSIS B [OPTIONS] I I B -1 [OPTIONS] [I...] =head1 DESCRIPTION Unless option B<-1> was applied B reads golden classes from I (one class per line) and predicted classes from I (one class per line) and outputs precision, recall, F1 and some other statistics to stdout. It is allowed for I to contain two tokens per line, the first one is a class and the second one is a score, e.g. probability. If B<-1> was applied, two or three tokens per line are expected on input: golden class, predicted class, and optional score. =head1 OPTIONS =over 6 =item B<-h, --help> Display help information. =item B<-R, --raw> Raw tab-separated output. =item B<-m, --micro-avg> Disable micro averaged P/R/F1 output. =item B<-r, --macro-avg> Disable macro averaged P/R/F1 output. =item B<-c, --per-class> Disable output of per-class statistics. =item B<-a, --accuracy> Disable output of accuracy. =item B<-1, --single> 2 or 3 tokens per line are expected on input =item B<-u, --unclassified> I