[88849] trunk/dports/math/libsvm/files
hum at macports.org
hum at macports.org
Fri Jan 13 05:07:06 PST 2012
Revision: 88849
http://trac.macports.org/changeset/88849
Author: hum at macports.org
Date: 2012-01-13 05:07:05 -0800 (Fri, 13 Jan 2012)
Log Message:
-----------
libsvm: add python scripts to be installed with tools variant.
Added Paths:
-----------
trunk/dports/math/libsvm/files/fselect.py
trunk/dports/math/libsvm/files/gridregression.py
trunk/dports/math/libsvm/files/plotroc.py
Added: trunk/dports/math/libsvm/files/fselect.py
===================================================================
--- trunk/dports/math/libsvm/files/fselect.py (rev 0)
+++ trunk/dports/math/libsvm/files/fselect.py 2012-01-13 13:07:05 UTC (rev 88849)
@@ -0,0 +1,566 @@
+#!/usr/bin/env python
+
+import random
+from random import randrange
+import sys
+from time import time
+from datetime import datetime
+#import string
+#from string import *
+import os
+from os import system
+from os import unlink
+from subprocess import *
+
##### Path Setting #####

# Command lines for the helper tools.  All paths are relative, so the script
# presumably expects to be started from the directory holding grid.py
# (TODO confirm against the install layout).
is_win32 = (sys.platform == 'win32')
if is_win32:
    gridpy_exe = r".\grid.py -log2c -2,9,2 -log2g 1,-11,-2"
    svmtrain_exe = r"..\windows\svmtrain.exe"
    svmpredict_exe = r"..\windows\svmpredict.exe"
else:
    gridpy_exe = "./grid.py -log2c -2,9,2 -log2g 1,-11,-2"
    svmtrain_exe = "../svm-train"
    svmpredict_exe = "../svm-predict"

##### Global Variables #####

# Input locations; filled in by arg_process().
train_pathfile = ""
train_file = ""
test_pathfile = ""
test_file = ""

# When non-zero, the entry point also runs predict_all().
if_predict_all = 0

# F-score per feature and the features ranked by score; filled in by main().
whole_fsc_dict = {}
whole_imp_v = []
+
+
def arg_process():
    """Read the training (and optional testing) file from ``sys.argv``.

    Sets the globals ``train_pathfile``/``train_file`` and, when a second
    argument is given, ``test_pathfile``/``test_file``.  The ``*_file``
    values are the last path component of the corresponding ``*_pathfile``.

    Prints a usage line and raises SystemExit when the argument count is
    wrong; an AssertionError is raised when a named file does not exist.
    """
    global train_pathfile, test_pathfile
    global train_file, test_file
    global svmtrain_exe, svmpredict_exe

    argc = len(sys.argv)
    if argc != 2 and argc != 3:
        print('Usage: %s training_file [testing_file]' % sys.argv[0])
        raise SystemExit

    train_pathfile = sys.argv[1]
    assert os.path.exists(train_pathfile), "training file not found"
    train_file = os.path.basename(train_pathfile)

    if argc == 3:
        test_pathfile = sys.argv[2]
        assert os.path.exists(test_pathfile), "testing file not found"
        test_file = os.path.basename(test_pathfile)
+
+
##### Decide sizes of selected features #####

def feat_num_try_half(max_index):
    """Return ``[max_index, max_index//2, ...]``, stopping before 1 is reached.

    For example ``50`` yields ``[50, 25, 12, 6, 3]``; values of 1 or less
    yield an empty list.
    """
    sizes = []
    while max_index > 1:
        sizes.append(max_index)
        max_index //= 2
    return sizes


def feat_num_try(f_tuple):
    """Choose the feature-subset sizes to evaluate.

    ``f_tuple`` is a sequence of ``(feature, score)`` pairs ordered by
    descending score.  Only the leading entries whose score is at least
    1e-20 are counted; that count is halved repeatedly and at most the
    first eight sizes are returned.

    An empty ``f_tuple`` returns ``[]`` (the earlier loop-index based
    version raised NameError in that case).
    """
    useful = 0
    for entry in f_tuple:
        if entry[1] < 1e-20:
            break
        useful += 1
    # only take first eight numbers (>1%)
    return feat_num_try_half(useful)[:8]
+
+
def random_shuffle(label, sample):
    """Shuffle ``label`` and ``sample`` in place with the same permutation.

    The generator is re-seeded with 1 on every call, so the resulting
    order is the same every time for a given length.
    """
    random.seed(1)  # so that result is the same every time
    remaining = len(label)
    while remaining > 0:
        # Pick one of the not-yet-fixed slots and move it to the tail.
        pick = randrange(0, remaining)
        last = remaining - 1
        label[pick], label[last] = label[last], label[pick]
        sample[pick], sample[last] = sample[last], sample[pick]
        remaining -= 1
+
+
+
### key function used in list.sort(): orders by element[1], largest first
def value_cmpf(x):
    """Return the negated second item of ``x`` (descending sort key)."""
    score = x[1]
    return -score
+
+### cal importance of features
+### return fscore_dict and feat with desc order
def cal_feat_imp(label,sample):
    """Score every feature and rank the features by descending score.

    Returns ``(score_dict, feat_v)``: the dict produced by ``cal_Fscore``
    and the list of its keys ordered with ``value_cmpf``.
    """
    print("calculating fsc...")

    score_dict = cal_Fscore(label, sample)
    ranked = sorted(score_dict.items(), key=value_cmpf)
    feat_v = [pair[0] for pair in ranked]

    print("fsc done")
    return score_dict, feat_v
+
+
+### select features and return new data
def select(sample, feat_v):
    """Project every sample onto the features listed in ``feat_v``.

    ``sample`` is a list of ``{feature: value}`` dicts.  The result is a new
    list of dicts holding only the requested features that are present in
    each sample, inserted in ascending feature order.

    ``feat_v`` is left untouched: a sorted copy is used instead of sorting
    the caller's list in place.
    """
    wanted = sorted(feat_v)
    return [dict((f, s[f]) for f in wanted if f in s) for s in sample]
+
+
+### Do parameter searching (grid.py)
def train_svm(tr_file):
    """Run the parameter search command on ``tr_file`` and parse its result.

    The command is ``gridpy_exe`` followed by ``tr_file`` and is executed
    through the shell.  Its last non-blank output line must hold three
    numbers, returned as ``(c, g, rate)`` floats.

    Raises ValueError when the command prints nothing or when the last line
    does not hold exactly three numbers.
    """
    cmd = "%s %s" % (gridpy_exe, tr_file)
    print(cmd)
    print('Cross validation...')

    # communicate() drains the pipe, closes it and reaps the child process.
    proc = Popen(cmd, shell=True, stdout=PIPE)
    output = proc.communicate()[0]

    lines = [ln for ln in output.splitlines() if ln.strip()]
    if not lines:
        raise ValueError("no output from parameter search: %s" % cmd)
    c, g, rate = map(float, lines[-1].split())

    print('Best c=%s, g=%s CV rate=%s' % (c, g, rate))

    return c, g, rate
+
+### Given (C,g) and training/testing data,
+### return predicted labels
def predict(tr_label, tr_sample, c, g, test_label, test_sample, del_model=1, model_name=None):
    """Train with the given (C, g) and return the predicted test labels.

    The training and testing data are written to ``<train_file>.tr`` and
    ``<train_file>.te``, ``svmtrain_exe`` and ``svmpredict_exe`` are run via
    ``os.system``, and the prediction file is parsed as one float per line.

    The model is stored in ``model_name`` if given, otherwise in
    ``<train_file>.tr.model``; it is deleted afterwards unless ``del_model``
    is false.  The other intermediate files are always removed.
    """
    tr_file = train_file + ".tr"
    te_file = train_file + ".te"
    model_file = model_name if model_name else "%s.model" % tr_file
    out_file = "%s.o" % te_file

    # train
    writedata(tr_sample, tr_label, tr_file)
    cmd = "%s -c %f -g %f %s %s" % (svmtrain_exe, c, g, tr_file, model_file)
    os.system(cmd)

    # test
    writedata(test_sample, test_label, te_file)
    cmd = "%s %s %s %s" % (svmpredict_exe, te_file, model_file, out_file)
    print(cmd)
    os.system(cmd)

    # fill in pred_y; the handle is closed even if a line fails to parse
    with open(out_file) as fp:
        pred_y = [float(line) for line in fp]

    rem_file(tr_file)
    rem_file(te_file)
    if del_model:
        rem_file(model_file)
    rem_file(out_file)

    return pred_y
+
+
def cal_acc(pred_y, real_y):
    """Return the fraction of predictions that equal the true labels.

    Prints ``ACC: <right>/<total>``.  ``real_y`` must be at least as long as
    ``pred_y``.  An empty ``pred_y`` yields 0.0 instead of dividing by zero.
    """
    total = len(pred_y)
    right = float(sum(1 for i in range(total) if pred_y[i] == real_y[i]))

    print("ACC: %d/%d" % (right, total))
    if total == 0:
        return 0.0
    return right / total
+
+### balanced accuracy
def cal_bacc(pred_y, real_y):
    """Return the balanced accuracy: the mean of the per-class hit rates.

    A sample is "positive" when its true label equals 1, otherwise
    "negative".  The four counters are printed and also sent to
    ``writelog``.  A class without samples contributes a rate of 0.
    """
    hits = {True: 0.0, False: 0.0}
    totals = {True: 0, False: 0}

    for i in range(len(pred_y)):
        is_pos = bool(real_y[i] == 1)
        totals[is_pos] += 1
        if real_y[i] == pred_y[i]:
            hits[is_pos] += 1

    p_right, n_right = hits[True], hits[False]
    p_num, n_num = totals[True], totals[False]

    print([p_right, p_num, n_right, n_num])
    writelog(" p_yes/p_num, n_yes/n_num: %d/%d , %d/%d\n" % (p_right, p_num, n_right, n_num))

    # avoid dividing by zero for an absent class
    p_den = p_num if p_num != 0 else 1
    n_den = n_num if n_num != 0 else 1
    return 0.5 * (p_right / p_den + n_right / n_den)
+
+
+##### Log related #####
def initlog(name):
    """Remember ``name`` as the global log file and create/truncate it."""
    global logname
    logname = name
    # Opening for writing is enough to leave an empty file behind.
    open(logname, 'w').close()
+
+
# Verbosity levels attached to log messages.
VERBOSE_MAX = 100
VERBOSE_ITER = 3
VERBOSE_GRID_TIME = 2
VERBOSE_TIME = 1


def writelog(str, vlevel=VERBOSE_MAX):
    """Append ``str`` to the global log file.

    Only messages whose ``vlevel`` is greater than ``VERBOSE_ITER`` are
    written; all others are dropped.  The file named by the global
    ``logname`` is opened and closed on every call.
    """
    if vlevel <= VERBOSE_ITER:
        return
    with open(logname, 'a') as log_fd:
        log_fd.write(str)
+
+
def rem_file(filename):
    """Delete ``filename``; a file that does not exist is silently ignored.

    This keeps the ``rm -f`` behaviour of the shell command the function
    replaced, so cleaning up optional outputs cannot abort the run.  Any
    other OSError (for example a permission problem) is still raised.
    """
    import errno

    try:
        unlink(filename)
    except OSError as err:
        if err.errno != errno.ENOENT:
            raise
+
+##### MAIN FUNCTION #####
def main():
    """Select a feature subset by F-score and, if given, evaluate a test file.

    Steps, all driven by the globals set in arg_process():
      1. read and shuffle the training data;
      2. score the features and write the ranking to ``<train_file>.fscore``;
      3. for each candidate subset size, run the parameter search and record
         the reported cross-validation rate;
      4. keep the size with the highest rate and log the chosen features;
      5. when a testing file was given, train on the chosen features,
         predict, log the accuracy and write ``<test_file>.<n>.pred``.

    Raises SystemExit when no subset size is available (feat_num_try
    returned an empty list), rather than failing later with an unrelated
    error.
    """
    global train_pathfile, train_file
    global test_pathfile, test_file
    global whole_fsc_dict,whole_imp_v

    ### Read Data
    print("reading....")
    t=time()
    train_label, train_sample, max_index = readdata(train_pathfile)
    t=time()-t
    writelog("loading data '%s': %.1f sec.\n"%(train_pathfile,t), VERBOSE_TIME)
    print("read done")

    ### Randomly shuffle data
    random_shuffle(train_label, train_sample)

    ###calculate f-score of whole training data
    #whole_imp_v contains feat with order
    t=time()
    whole_fsc_dict,whole_imp_v = cal_feat_imp(train_label,train_sample)
    t=time()-t
    writelog("cal f-score time: %.1f\n"%t, VERBOSE_TIME)

    ###write (sorted) f-score list in another file
    f_tuples = list(whole_fsc_dict.items())
    f_tuples.sort(key = value_cmpf)
    with open("%s.fscore"%train_file, 'w') as fd:
        for feat_score in f_tuples:
            fd.write("%d: \t%.6f\n"%feat_score)

    ### decide sizes of features to try
    fnum_v = feat_num_try(f_tuples) #ex: [50,25,12,6,3,1]
    writelog("try feature sizes: %s\n\n"%(fnum_v))

    if not fnum_v:
        writelog("no feature size to try\n")
        raise SystemExit("no usable feature subset size; nothing to select")

    writelog("%#Feat\test. acc.\n")

    # every candidate subset is written to the same temporary file
    tr_sel_name = train_file+".tr"

    est_acc=[]
    #for each possible feature subset
    for fn in fnum_v:
        # fn is the number of features selected,
        # fv is indices of selected features
        fv = whole_imp_v[:fn]

        t=time()
        #pick features
        tr_sel_samp = select(train_sample, fv)
        t=time()-t
        writelog("\n feature num: %d\n"%fn, VERBOSE_ITER)
        writelog(" pick time: %.1f\n"%t, VERBOSE_TIME)

        t=time()
        writedata(tr_sel_samp,train_label,tr_sel_name)
        t=time()-t
        writelog(" write data time: %.1f\n"%t, VERBOSE_TIME)

        t=time()
        # choose best c, gamma from splitted training sample
        c,g, cv_acc = train_svm(tr_sel_name)
        t=time()-t
        writelog(" choosing c,g time: %.1f\n"%t, VERBOSE_GRID_TIME)

        est_acc.append(cv_acc)
        writelog("%d:\t%.5f\n"%(fn,cv_acc) )

    print(fnum_v)
    print(est_acc)

    best_acc = max(est_acc)
    fnum=fnum_v[est_acc.index(best_acc)]
    print('Number of selected features %s' % fnum)
    print('Please see %s.select for details' % train_file)

    #result for features selected
    sel_fv = whole_imp_v[:fnum]

    writelog("max validation accuarcy: %.6f\n"%best_acc)
    writelog("\nselect features: %s\n"%sel_fv)
    writelog("%s features\n"%fnum)

    # REMOVE INTERMEDIATE TEMPORARY FILE: training file after selection
    rem_file(tr_sel_name)
    rem_file("%s.out"%tr_sel_name)
    rem_file("%s.png"%tr_sel_name)

    ### do testing

    test_label=None
    test_sample=None
    if test_pathfile != "":
        print("reading testing data....")
        test_label, test_sample, max_index = readdata(test_pathfile)
        writelog("\nloading testing data '%s'\n"%test_pathfile)
        print("read done")

        #picking features
        train_sel_samp = select(train_sample, sel_fv)
        test_sel_samp = select(test_sample, sel_fv)

        #grid search
        train_sel_name = "%s.%d"%(train_file,fnum)
        writedata(train_sel_samp,train_label,train_sel_name)
        c,g, cv_acc = train_svm(train_sel_name)
        writelog("best (c,g)= %s, cv-acc = %.6f\n"%([c,g],cv_acc))

        # REMOVE INTERMEDIATE TEMPORARY FILE: training file after selection
        rem_file(train_sel_name)

        ### predict (the model file is kept: del_model=0)
        pred_y = predict(train_label, train_sel_samp, c, g, test_label, test_sel_samp, 0, "%s.model"%train_sel_name)

        #calculate accuracy
        acc = cal_acc(pred_y, test_label)
        ##acc = cal_bacc(pred_y, test_label)
        writelog("testing accuracy = %.6f\n"%acc)

        #writing predict labels
        out_name = "%s.%d.pred"%(test_file,fnum)
        with open(out_name, 'w') as fd:
            for y in pred_y:
                fd.write("%f\n"%y)
+
+
+### predict all possible sets ###
### predict all possible sets ###
def predict_all():
    """Train and test once for every candidate feature-subset size.

    Uses the ranking already stored in the globals ``whole_fsc_dict`` and
    ``whole_imp_v``.  For each size the accuracy is logged and the predicted
    labels are written to ``<test_file>.<n>.pred``.
    """
    global train_pathfile, train_file
    global test_pathfile, test_file

    global whole_fsc_dict,whole_imp_v

    train_label, train_sample, max_index = readdata(train_pathfile)
    test_label, test_sample, m = readdata(test_pathfile)

    random_shuffle(train_label, train_sample)

    ###whole_fsc_dict, ordered_feats = cal_feat_imp(train_label,train_sample)
    ordered_feats = whole_imp_v
    # items() is a view on Python 3 and has no sort(); build a sorted list
    f_tuples = sorted(whole_fsc_dict.items(), key = value_cmpf)

    fnum_v = feat_num_try(f_tuples) #ex: [50,25,12,6,3,1]

    writelog("\nTest All %s\n"%fnum_v)
    for fnum in fnum_v:
        sel_fv = ordered_feats[:fnum]

        #picking features
        train_sel_samp = select(train_sample, sel_fv)
        test_sel_samp = select(test_sample, sel_fv)

        #grid search
        train_sel_name = "%s.%d"%(train_file,fnum)
        writedata(train_sel_samp,train_label,train_sel_name)
        c,g, cv_acc = train_svm(train_sel_name)
        writelog("best (c,g)= %s, cv-acc = %.6f\n"%([c,g],cv_acc))

        # REMOVE INTERMEDIATE TEMPORARY FILE: training file after selection
        rem_file(train_sel_name)

        #predict
        pred_y = predict(train_label, train_sel_samp, c, g, test_label, test_sel_samp)

        #calculate accuracy
        acc = cal_acc(pred_y, test_label)
        ##acc = cal_bacc(pred_y, test_label)
        writelog("feat# %d, testing accuracy = %.6f\n"%(fnum,acc))

        #writing predict labels
        out_name = "%s.%d.pred"%(test_file,fnum)
        with open(out_name, 'w') as fd:
            for y in pred_y:
                fd.write("%f\n"%y)

        # switch to 1 to also delete the .out/.png files of the grid search
        del_out_png = 0
        if del_out_png:
            rem_file("%s.out"%train_sel_name)
            rem_file("%s.png"%train_sel_name)
+
+
+###return a dict containing F_j
def cal_Fscore(labels,samples):
    """Return ``{feature_index: F-score}`` for features 1..max index.

    ``samples`` is a list of ``{feature_index: value}`` dicts and
    ``labels[i]`` is the class of ``samples[i]``.  Features missing from a
    sample contribute nothing to the sums (i.e. count as 0).

    For each feature the score is SB / SW with
      SB = sum over classes of n_c * (class mean - overall mean)^2
      SW = 1e-12 + sum over classes of (sum of squares - (sum)^2 / n_c)
    """
    data_num = float(len(samples))

    ### pass 1: class sizes and the largest feature index
    p_num = {}     # label -> number of samples
    max_idx = -1
    for idx, point in enumerate(samples):
        label = labels[idx]
        p_num[label] = p_num.get(label, 0) + 1
        for f in point:
            if f > max_idx:
                max_idx = f

    ### accumulators; list position k belongs to feature k+1
    sum_f = [0] * max_idx
    sum_l_f = dict((la, [0] * max_idx) for la in p_num)    # per-class sums
    sumq_l_f = dict((la, [0] * max_idx) for la in p_num)   # per-class sums of squares

    ### pass 2: accumulate the sums
    for idx, point in enumerate(samples):
        class_sum = sum_l_f[labels[idx]]
        class_sq = sumq_l_f[labels[idx]]
        for feat, value in point.items():
            col = feat - 1
            sum_f[col] += value
            class_sum[col] += value
            class_sq[col] += value**2

    ### score every feature
    eps = 1e-12
    F = {}
    for col in range(max_idx):
        overall_mean = sum_f[col] / data_num
        SB = 0
        SW = eps
        for la, count in p_num.items():
            SB += (count * (sum_l_f[la][col] / count - overall_mean)**2)
            SW += (sumq_l_f[la][col] - (sum_l_f[la][col]**2) / count)
        F[col + 1] = SB / SW

    return F
+
+
+###### svm data IO ######
+
def readdata(filename):
    """Parse a file of ``label index:value index:value ...`` lines.

    Blank lines and lines whose first non-blank character is ``#`` are
    skipped (a blank line used to raise IndexError).

    Returns ``(labels, samples, max_index)``: a list of float labels, a list
    of ``{int index: float value}`` dicts, and the largest index seen
    (0 when there is none).
    """
    labels = []
    samples = []
    max_index = 0

    with open(filename) as fp:
        for line in fp:
            line = line.strip()
            # allow blank lines and data with comments
            if not line or line[0] == "#":
                continue

            elems = line.split()
            sample = {}
            for e in elems[1:]:
                points = e.split(":")
                p0 = int(points[0].strip())
                p1 = float(points[1].strip())
                sample[p0] = p1
                if p0 > max_index:
                    max_index = p0
            labels.append(float(elems[0]))
            samples.append(sample)

    return labels, samples, max_index
+
def writedata(samples,labels,filename):
    """Write samples as ``label index:value ...`` lines.

    ``samples`` is a list of ``{index: value}`` dicts; indices are written
    in ascending order.  When ``labels`` is empty or None every line gets
    the label ``0``.  When ``filename`` is empty the lines go to standard
    output, which is flushed but left open (it used to be closed, breaking
    every later print).
    """
    to_file = bool(filename)
    fp = open(filename, "w") if to_file else sys.stdout

    try:
        for i in range(len(samples)):
            parts = ["%s" % labels[i] if labels else "0"]
            point = samples[i]
            for k in sorted(point.keys()):
                parts.append(" %d:%f" % (k, point[k]))
            parts.append("\n")
            fp.write("".join(parts))
        fp.flush()
    finally:
        # only close what this function opened
        if to_file:
            fp.close()
+
+
###### PROGRAM ENTRY POINT ######

# Resolve the input file names first; the later steps read them from globals.
arg_process()

# The run log is named "<training file name>.select".
initlog(train_file + ".select")
writelog("start: " + str(datetime.now()) + "\n\n")
main()

# do testing on all possible feature sets
if if_predict_all:
    predict_all()

writelog("\nend: \n" + str(datetime.now()) + "\n")
+
Added: trunk/dports/math/libsvm/files/gridregression.py
===================================================================
--- trunk/dports/math/libsvm/files/gridregression.py (rev 0)
+++ trunk/dports/math/libsvm/files/gridregression.py 2012-01-13 13:07:05 UTC (rev 88849)
@@ -0,0 +1,311 @@
+#!/usr/bin/env python
+
+
+
+import os, sys, traceback
+import getpass
+from threading import Thread
+from subprocess import *
+
+if(sys.hexversion < 0x03000000):
+ import Queue
+else:
+ import queue as Queue
+
+
# svmtrain and gnuplot executable

is_win32 = (sys.platform == 'win32')
svmtrain_exe = "../svm-train"
gnuplot_exe = "/usr/bin/gnuplot"
# example for windows
# svmtrain_exe = r"c:\tmp\libsvm-2.4\windows\svmtrain.exe"
# gnuplot_exe = r"c:\tmp\gp373w32\pgnuplot.exe"

# global parameters and their default values
# (each range is begin, end, step; consumed by calculate_jobs via range_f)

fold = 5

c_begin = -1
c_end = 6
c_step = 1

g_begin = 0
g_end = -8
g_step = -1

p_begin = -8
p_end = -1
p_step = 1

# dataset_pathname, dataset_title, pass_through_string, out_filename and
# png_filename have no default; process_options() assigns them.

# experimental

# Host names given here get one SSHWorker each (see main()).
ssh_workers = []
# ssh_workers = ['linux1','linux1','linux2','linux2','linux3', 'linux4', 'linux6','linux7','linux8','linux8','linux9','linux10','linux11','linux12']
nr_local_worker = 1
+
+# process command line options, set global parameters
def process_options(argv=sys.argv):
    """Parse the command line into the module-level parameters.

    The last argument is the dataset path.  Recognised options (each takes
    the following argument as its value): ``-log2c``, ``-log2g``, ``-log2p``
    (``begin,end,step`` as floats), ``-v``, ``-svmtrain``, ``-gnuplot``,
    ``-out`` and ``-png``.  ``-c``/``-g`` print a notice plus the usage text
    and exit with status 1.  Everything else is collected, in order, into
    ``pass_through_string``.

    Exits with status 1 when no argument is given; raises AssertionError
    when the svm-train executable, the gnuplot executable or the dataset
    does not exist.
    """
    global fold
    global c_begin, c_end, c_step
    global g_begin, g_end, g_step
    global p_begin, p_end, p_step
    global dataset_pathname, dataset_title, pass_through_string
    global svmtrain_exe, gnuplot_exe, gnuplot, out_filename, png_filename

    usage = """\
Usage: grid.py [-log2c begin,end,step] [-log2g begin,end,step] [-log2p begin,end,step] [-v fold]
[-svmtrain pathname] [-gnuplot pathname] [-out pathname] [-png pathname]
[additional parameters for svm-train] dataset"""

    if len(argv) < 2:
        print(usage)
        sys.exit(1)

    dataset_pathname = argv[-1]
    dataset_title = os.path.basename(dataset_pathname)
    out_filename = dataset_title + '.out'
    png_filename = dataset_title + '.png'
    extras = []

    def parse_triple(text):
        # "begin,end,step" -> three floats
        return [float(part) for part in text.split(",")]

    valued_options = ("-log2c", "-log2g", "-log2p", "-v",
                      '-svmtrain', '-gnuplot', '-out', '-png')

    last = len(argv) - 1    # index of the dataset argument
    pos = 1
    while pos < last:
        opt = argv[pos]
        if opt in ('-c','-g'):
            print("Option -c and -g are renamed.")
            print(usage)
            sys.exit(1)
        elif opt in valued_options:
            pos += 1
            value = argv[pos]
            if opt == "-log2c":
                (c_begin,c_end,c_step) = parse_triple(value)
            elif opt == "-log2g":
                (g_begin,g_end,g_step) = parse_triple(value)
            elif opt == "-log2p":
                (p_begin,p_end,p_step) = parse_triple(value)
            elif opt == "-v":
                fold = value
            elif opt == '-svmtrain':
                svmtrain_exe = value
            elif opt == '-gnuplot':
                gnuplot_exe = value
            elif opt == '-out':
                out_filename = value
            else:
                png_filename = value
        else:
            extras.append(opt)
        pos += 1

    pass_through_string = " ".join(extras)
    assert os.path.exists(svmtrain_exe),"svm-train executable not found"
    assert os.path.exists(gnuplot_exe),"gnuplot executable not found"
    assert os.path.exists(dataset_pathname),"dataset not found"
+
+
def range_f(begin,end,step):
    """Like ``range`` but inclusive of ``end`` and usable with non-integers.

    Returns ``[begin, begin+step, ...]`` up to and including ``end``.  The
    direction follows the sign of ``step``; a zero ``step`` never terminates.
    """
    values = []
    current = begin
    while not ((step > 0 and current > end) or (step < 0 and current < end)):
        values.append(current)
        current = current + step
    return values
+
def permute_sequence(seq):
    """Reorder ``seq`` so the middle element comes first.

    The middle element is followed by the (recursively reordered) left and
    right halves, interleaved one element at a time.  Sequences of length
    0 or 1 are returned unchanged.
    """
    size = len(seq)
    if size <= 1:
        return seq

    middle = size // 2
    head = permute_sequence(seq[:middle])
    tail = permute_sequence(seq[middle+1:])

    ordered = [seq[middle]]
    for k in range(max(len(head), len(tail))):
        if k < len(head):
            ordered.append(head[k])
        if k < len(tail):
            ordered.append(tail[k])

    return ordered
+
def redraw (db,tofile=0):
    """Send a contour plot of ``db`` to the gnuplot pipe.

    ``db`` is a list of 3-item records; item 2 is the value that is
    contoured.  It is sorted in place by item 0 ascending, then item 1
    descending.  With ``tofile`` true the plot goes to ``png_filename``,
    otherwise to the screen terminal.  Does nothing for an empty ``db``.

    Requires the global ``gnuplot`` to be a writable binary stream.
    NOTE(review): in this file the line creating that stream is commented
    out in process_options() and main() never calls redraw().

    Fixes: the maximum is now taken over item 2 of each record (the
    ``map``/``lambda`` call had a misplaced parenthesis and raised
    TypeError), and every command is formatted as text and then encoded,
    because ``bytes %`` rejects str and number arguments on Python 3.
    """
    if len(db) == 0: return
    begin_level = round(max(x[2] for x in db)) - 3
    step_size = 0.5

    def send(text):
        gnuplot.write(text.encode())

    if tofile:
        send("set term png transparent small color\n")
        send("set output \"%s\"\n" % png_filename.replace('\\','\\\\'))
        #send("set term postscript color solid\n")
        #send("set output \"%s.ps\"\n" % dataset_title)
    else:
        if is_win32:
            send("set term windows\n")
        else:
            send("set term x11\n")
    send("set xlabel \"lg(C)\"\n")
    send("set ylabel \"lg(gamma)\"\n")
    send("set xrange [%s:%s]\n" % (c_begin,c_end))
    send("set yrange [%s:%s]\n" % (g_begin,g_end))
    send("set contour\n")
    send("set cntrparam levels incremental %s,%s,100\n" % (begin_level,step_size))
    send("set nosurface\n")
    send("set view 0,0\n")
    send("set label \"%s\" at screen 0.4,0.9\n" % dataset_title)
    send("splot \"-\" with lines\n")

    db.sort(key = lambda x:(x[0], -x[1]))

    # a blank line separates the data blocks of different item-0 values
    prevc = db[0][0]
    for line in db:
        if prevc != line[0]:
            send("\n")
            prevc = line[0]
        send("%s %s %s\n" % line)
    send("e\n")
    send("\n") # force gnuplot back to prompt when term set failure
    gnuplot.flush()
+
+
def calculate_jobs():
    """Build the list of (c, g, p) exponent triples to evaluate.

    Each of the three ranges is expanded with ``range_f`` and reordered with
    ``permute_sequence``.  The result holds one single-element list
    ``[(c, g, p)]`` per combination, with c varying slowest and p fastest.
    """
    c_seq = permute_sequence(range_f(c_begin,c_end,c_step))
    g_seq = permute_sequence(range_f(g_begin,g_end,g_step))
    p_seq = permute_sequence(range_f(p_begin,p_end,p_step))

    return [[(c, g, p)] for c in c_seq for g in g_seq for p in p_seq]
+
class WorkerStopToken:
    """Marker placed in the job queue to notify the workers to stop."""
+
class Worker(Thread):
    """Thread that takes (c, g, p) exponent jobs from a queue and runs them.

    Subclasses provide ``run_one(c, g, p)``, which receives the actual
    parameter values (2.0 ** exponent) and returns the measured rate, or
    None on failure.
    """

    def __init__(self,name,job_queue,result_queue):
        Thread.__init__(self)
        self.name = name
        self.job_queue = job_queue
        self.result_queue = result_queue

    def run(self):
        """Process jobs until the stop token arrives or a job fails.

        Each success puts ``(name, cexp, gexp, pexp, rate)`` on the result
        queue.  The stop token is put back so the other workers see it too.
        On failure the traceback is printed, the job is put back for another
        worker, and this worker quits.
        """
        while True:
            (cexp,gexp,pexp) = self.job_queue.get()
            if cexp is WorkerStopToken:
                self.job_queue.put((cexp,gexp,pexp))
                # print 'worker %s stop.' % self.name
                break
            try:
                rate = self.run_one(2.0**cexp,2.0**gexp,2.0**pexp)
                if rate is None: raise RuntimeError("get no rate")
            except Exception:
                # we failed, let others do that and we just quit.
                # Only Exception is caught, so SystemExit and
                # KeyboardInterrupt are no longer swallowed here.
                traceback.print_exc()

                self.job_queue.put((cexp,gexp,pexp))
                print('worker %s quit.' % self.name)
                break
            else:
                self.result_queue.put((self.name,cexp,gexp,pexp,rate))
+
class LocalWorker(Worker):
    """Worker that runs svm-train on the local machine."""

    def run_one(self,c,g,p):
        """Run one cross-validation and return the reported number.

        Runs ``svmtrain_exe -s 3 -c .. -g .. -p .. -v fold`` through the
        shell and returns the last whitespace-separated token of the first
        output line containing ``Cross`` as a float, or None when no such
        line is printed.
        """
        cmdline = '%s -s 3 -c %s -g %s -p %s -v %s %s %s' % \
          (svmtrain_exe,c,g,p,fold,pass_through_string,dataset_pathname)
        # communicate() drains and closes the pipe and reaps the child, so
        # repeated jobs do not accumulate open pipes or unreaped processes.
        output = Popen(cmdline,shell=True,stdout=PIPE).communicate()[0]
        for line in output.splitlines():
            if b"Cross" in line:
                return float(line.split()[-1])
        return None
+
class SSHWorker(Worker):
    """Worker that runs svm-train on another host through ``ssh``."""

    def __init__(self,name,job_queue,result_queue,host):
        Worker.__init__(self,name,job_queue,result_queue)
        self.host = host
        # the remote command changes into this directory first
        self.cwd = os.getcwd()

    def run_one(self,c,g,p):
        """Run one cross-validation on ``self.host`` and return the result.

        Returns the last whitespace-separated token of the first output line
        containing ``Cross`` as a float, or None when no such line is
        printed.
        """
        cmdline = 'ssh %s "cd %s; %s -s 3 -c %s -g %s -p %s -v %s %s %s"' % \
          (self.host,self.cwd,
           svmtrain_exe,c,g,p,fold,pass_through_string,dataset_pathname)
        # communicate() drains and closes the pipe and reaps the child.
        output = Popen(cmdline,shell=True,stdout=PIPE).communicate()[0]
        for line in output.splitlines():
            if b"Cross" in line:
                return float(line.split()[-1])
        return None
+
def main():
    """Run the grid search and report the parameters with the lowest value.

    Every (c, g, p) job is queued, the SSH and local workers are started,
    and the results are collected until all jobs are done.  Each result is
    appended to ``out_filename`` and echoed together with the best values
    so far; the final line printed is ``best_c best_g best_p best_mse``.
    """

    # set parameters

    process_options()

    # put jobs in queue

    jobs = calculate_jobs()
    job_queue = Queue.Queue(0)
    result_queue = Queue.Queue(0)

    for line in jobs:
        for (c,g,p) in line:
            job_queue.put((c,g,p))

    # hack the queue to become a stack --
    # this is important when some thread
    # failed and re-put a job. If we still
    # use FIFO, the job will be put
    # into the end of the queue, and the graph
    # will only be updated in the end

    job_queue._put = job_queue.queue.appendleft

    # fire ssh workers

    if ssh_workers:
        for host in ssh_workers:
            SSHWorker(host,job_queue,result_queue,host).start()

    # fire local workers

    for i in range(nr_local_worker):
        LocalWorker('local',job_queue,result_queue).start()

    # gather results

    done_jobs = {}
    best_mse = float('+inf')
    # defined up front so the progress line works even when the first
    # result does not improve on +inf
    best_c = best_g = best_p = None

    result_file = open(out_filename,'w')
    try:
        for line in jobs:
            for (c,g,p) in line:
                # results arrive in any order; wait until this job is in
                while (c,g,p) not in done_jobs:
                    (worker,c1,g1,p1,mse) = result_queue.get()
                    done_jobs[(c1,g1,p1)] = mse
                    result_file.write('%s %s %s %s\n' % (c1,g1,p1,mse))
                    result_file.flush()
                    if mse < best_mse:
                        best_mse = mse
                        best_c = 2.0**c1
                        best_g = 2.0**g1
                        best_p = 2.0**p1
                    print("[%s] %s %s %s %s (best c=%s, g=%s, p=%s, mse=%s)" % \
                        (worker,c1,g1,p1,mse,best_c,best_g,best_p,best_mse))
    finally:
        result_file.close()

    job_queue.put((WorkerStopToken,None,None))
    print("%s %s %s %s" % (best_c,best_g,best_p,best_mse))

main()
Added: trunk/dports/math/libsvm/files/plotroc.py
===================================================================
--- trunk/dports/math/libsvm/files/plotroc.py (rev 0)
+++ trunk/dports/math/libsvm/files/plotroc.py 2012-01-13 13:07:05 UTC (rev 88849)
@@ -0,0 +1,219 @@
+#!/usr/bin/env python
+#This tool allow users to plot SVM-prob ROC curve from data
+from svmutil import *
+from sys import argv, platform
+from os import path, popen
+from random import randrange , seed
+from operator import itemgetter
+from time import sleep
+
# Candidate locations of the gnuplot executable, tried in order.
# A Windows path containing spaces must be wrapped in double quotes; write it
# as a raw string (leading 'r') or double every backslash.
gnuplot_exe_list = [
    r'"C:\Program Files\gnuplot\pgnuplot.exe"',
    "/usr/bin/gnuplot",
    "/usr/local/bin/gnuplot",
]
+
+
def get_pos_deci(train_y, train_x, test_y, test_x, param):
    """Train a model and return ``(decision values, model)`` for the test set.

    Each returned value is the first decision value of a test instance
    multiplied by the model's first label, so that it is positive for the
    positive class.
    """
    model = svm_train(train_y, train_x, param)
    labels = model.get_labels()
    # only the decision values of the prediction are needed
    _, _, raw_deci = svm_predict(test_y, test_x, model)
    signed = [labels[0] * row[0] for row in raw_deci]
    return signed, model
+
+#get_cv_deci(prob_y[], prob_x[], svm_parameter param, nr_fold)
+#input raw attributes, labels, param, cv_fold in decision value building
+#output list of decision value, remember to seed(0)
def get_cv_deci(prob_y, prob_x, param, nr_fold):
    """Return decision values for all instances via cross validation.

    With ``nr_fold`` of 0 or 1 the model is trained and evaluated on the
    whole data.  Otherwise ``prob_y``/``prob_x`` are shuffled IN PLACE
    (seed the random module beforehand for reproducible results) and each
    fold is predicted by a model trained on the remaining folds; the
    values are returned in the shuffled order.
    """
    if nr_fold in (0, 1):
        return get_pos_deci(prob_y, prob_x, prob_y, prob_x, param)[0]

    total = len(prob_y)

    # random permutation by swapping instance i with a random later one
    for i in range(total):
        j = randrange(i, total)
        prob_x[i], prob_x[j] = prob_x[j], prob_x[i]
        prob_y[i], prob_y[j] = prob_y[j], prob_y[i]

    # cross training: fold k covers [k*total//nr_fold, (k+1)*total//nr_fold)
    all_deci = []
    for k in range(nr_fold):
        lo = k * total // nr_fold
        hi = (k + 1) * total // nr_fold
        fold_deci, _ = get_pos_deci(prob_y[:lo] + prob_y[hi:],
                                    prob_x[:lo] + prob_x[hi:],
                                    prob_y[lo:hi],
                                    prob_x[lo:hi],
                                    param)
        all_deci += fold_deci
    return all_deci
+
+#a simple gnuplot object
#a simple gnuplot object
class gnuplot:
    """Minimal wrapper around a gnuplot process fed through a pipe.

    Assigning a string to any attribute sends ``set <attr> "<value>"`` to
    gnuplot (see ``__setattr__``), so internal state is stored by writing
    to ``self.__dict__`` directly.
    """

    def __init__(self, term='onscreen'):
        """Start gnuplot and select the terminal for ``term``.

        ``term`` is ``'onscreen'`` or an output file name containing
        ``.ps`` or ``.png``.  The executable comes from the global
        ``gnuplot_exe``.
        """
        # -persists leave plot window on screen after gnuplot terminates
        if platform == 'win32':
            cmdline = gnuplot_exe
            self.__dict__['screen_term'] = 'windows'
        else:
            cmdline = gnuplot_exe + ' -persist'
            self.__dict__['screen_term'] = 'x11'
        self.__dict__['iface'] = popen(cmdline,'w')
        self.set_term(term)

    def set_term(self, term):
        """Send the ``set term`` (and, for files, ``set output``) commands.

        Raises SystemExit when ``term`` is neither ``'onscreen'`` nor a
        name containing ``.ps`` / ``.png`` after its first character.
        """
        if term=='onscreen':
            self.writeln("set term %s" % self.screen_term)
        else:
            #term must be either x.ps or x.png
            if term.find('.ps')>0:
                self.writeln("set term postscript eps color 22")
            elif term.find('.png')>0:
                self.writeln("set term png")
            else:
                print("You must set term to either *.ps or *.png")
                raise SystemExit
            # goes through __setattr__: sends 'set output "<term>"'
            self.output = term
        # remembered for __repr__ (stored directly, not sent to gnuplot)
        self.__dict__['term'] = term

    def writeln(self,cmdline):
        """Send one command line to gnuplot."""
        self.iface.write(cmdline + '\n')

    def __setattr__(self, attr, val):
        """Translate ``obj.attr = "text"`` into ``set attr "text"``.

        Raises SystemExit for non-string values.
        """
        if isinstance(val, str):
            self.writeln('set %s \"%s\"' % (attr, val))
        else:
            print("Unsupport format:", attr, val)
            raise SystemExit

    #terminate gnuplot
    def __del__(self):
        # iface is missing when __init__ failed before opening the pipe
        iface = self.__dict__.get('iface')
        if iface is None:
            return
        self.writeln("quit")
        iface.flush()
        iface.close()

    def __repr__(self):
        # the original referenced an undefined name and raised NameError
        return "<gnuplot instance: output=%s>" % self.__dict__.get('term')

    #data is a list of [x,y]
    def plotline(self, data):
        """Plot ``data`` (a list of ``[x, y]`` pairs) as one line."""
        self.writeln("plot \"-\" notitle with lines linewidth 1")
        for point in data:
            self.writeln("%f %f" % (point[0], point[1]))
            sleep(0) #delay
        self.writeln("e")
        if platform=='win32':
            sleep(3)
+
+#processing argv and set some global variables
def proc_argv(argv = argv):
    """Split the command line into libsvm options, fold, and file names.

    Command line: ``plotroc.py [-v cv_fold | -T testing_file]
    [libsvm-options] training_file``.  The last argument is the training
    file; ``-T`` and ``-v`` consume the argument that follows them.

    Returns ``(options, fold, train_file, test_file)`` where ``options`` is
    the remaining arguments joined by spaces, ``fold`` defaults to 5 and
    ``test_file`` to None.
    """
    train_file = argv[-1]
    test_file = None
    fold = 5
    options = []

    stop = len(argv) - 1    # index of the training file
    pos = 1
    while pos < stop:
        flag = argv[pos]
        if flag == '-T':
            pos += 1
            test_file = argv[pos]
        elif flag == '-v':
            pos += 1
            fold = int(argv[pos])
        else:
            options.append(flag)
        pos += 1

    return ' '.join(options), fold, train_file, test_file
+
def plot_roc(deci, label, output, title):
    """Plot the ROC curve of decision values ``deci`` against ``label``.

    A label greater than 0 counts as positive, anything else as negative.
    The curve is plotted twice: to the file ``output`` and on screen.  The
    plot title includes the area under the curve; ``title`` defaults to
    ``output`` when None.
    """
    #count of positive and negative labels
    pos = sum(1 for value in label if value > 0)
    neg = len(label) - pos
    db = [[deci[i], label[i]] for i in range(len(label))]

    #sorting by decision value, largest first
    db.sort(key=itemgetter(0), reverse=True)

    #calculate ROC: one (FPR, TPR) point after each instance
    xy_arr = []
    tp = fp = 0. #assure float division
    for _, lab in db:
        if lab > 0: #positive
            tp += 1
        else:
            fp += 1
        xy_arr.append([fp/neg, tp/pos])

    #area under curve: add a strip whenever x moves on
    aoc = 0.
    prev_x = 0
    for x, y in xy_arr:
        if x != prev_x:
            aoc += (x - prev_x) * y
            prev_x = x

    #begin gnuplot
    if title is None:
        title = output

    #first write to file, then display on screen; the objects are kept
    #in a list so both stay alive until the function returns
    plots = []
    for target in (output, 'onscreen'):
        plot = gnuplot(target)
        plots.append(plot)
        plot.xlabel = "False Positive Rate"
        plot.ylabel = "True Positive Rate"
        plot.title = "ROC curve of %s (AUC = %.4f)" % (title,aoc)
        plot.plotline(xy_arr)
+
def check_gnuplot_exe():
    """Set the global ``gnuplot_exe`` to the first existing candidate.

    Candidates come from ``gnuplot_exe_list``; double quotes are stripped
    before the existence check, but the stored value keeps them.  Prints a
    hint and raises SystemExit when none of the candidates exists.
    """
    global gnuplot_exe
    gnuplot_exe = None
    for candidate in gnuplot_exe_list:
        if path.exists(candidate.replace('"','')):
            gnuplot_exe = candidate
            break
    else:
        # loop finished without finding anything
        print("You must add correct path of 'gnuplot' into gnuplot_exe_list")
        raise SystemExit
+
def main():
    """Command-line entry: plot the ROC curve for a training file.

    With ``-T testing_file`` the model is trained on the training file and
    the curve is drawn for the testing file; otherwise the decision values
    come from cross validation on the training file.  The plot is written
    to ``<training file name>-roc.png``.  Exits when gnuplot is not found,
    when no argument is given, or when the labels are not exactly 1 and -1.
    """
    check_gnuplot_exe()
    if len(argv) <= 1:
        print("Usage: %s [-v cv_fold | -T testing_file] [libsvm-options] training_file" % argv[0])
        raise SystemExit
    param,fold,train_file,test_file = proc_argv()
    train_name = path.split(train_file)[1]
    output_file = train_name + '-roc.png'

    #read data
    train_y, train_x = svm_read_problem(train_file)
    if set(train_y) != set([1,-1]):
        print("ROC is only applicable to binary classes with labels 1, -1")
        raise SystemExit

    #get decision value, with positive = label+
    seed(0) #reset random seed
    if not test_file: #single file -> CV
        output_title = train_name
        deci = get_cv_deci(train_y, train_x, param, fold)
        plot_roc(deci, train_y, output_file, output_title)
    else: #go with test_file
        output_title = "%s on %s" % (path.split(test_file)[1], train_name)
        test_y, test_x = svm_read_problem(test_file)
        if set(test_y) != set([1,-1]):
            print("ROC is only applicable to binary classes with labels 1, -1")
            raise SystemExit
        deci,model = get_pos_deci(train_y, train_x, test_y, test_x, param)
        plot_roc(deci, test_y, output_file, output_title)

if __name__ == '__main__':
    main()
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.macosforge.org/pipermail/macports-changes/attachments/20120113/3a867a77/attachment-0001.html>
More information about the macports-changes
mailing list