[88849] trunk/dports/math/libsvm/files

hum at macports.org hum at macports.org
Fri Jan 13 05:07:06 PST 2012


Revision: 88849
          http://trac.macports.org/changeset/88849
Author:   hum at macports.org
Date:     2012-01-13 05:07:05 -0800 (Fri, 13 Jan 2012)
Log Message:
-----------
libsvm: add python scripts to be installed with tools variant.

Added Paths:
-----------
    trunk/dports/math/libsvm/files/fselect.py
    trunk/dports/math/libsvm/files/gridregression.py
    trunk/dports/math/libsvm/files/plotroc.py

Added: trunk/dports/math/libsvm/files/fselect.py
===================================================================
--- trunk/dports/math/libsvm/files/fselect.py	                        (rev 0)
+++ trunk/dports/math/libsvm/files/fselect.py	2012-01-13 13:07:05 UTC (rev 88849)
@@ -0,0 +1,566 @@
+#!/usr/bin/env python
+
+import random
+from random import randrange
+import sys
+from time import time
+from datetime import datetime
+#import string
+#from string import *
+import os
+from os import system
+from os import unlink
+from subprocess import *
+
+##### Path Setting #####
+
+# Command lines of the helper programs. All paths are relative, so they are
+# resolved against the current working directory when a command is run.
+is_win32 = (sys.platform == 'win32')
+if not is_win32:
+	# grid.py is started with a fixed search range for log2(C) and log2(gamma)
+	gridpy_exe = "./grid.py -log2c -2,9,2 -log2g 1,-11,-2"
+	svmtrain_exe="../svm-train"
+	svmpredict_exe="../svm-predict"
+else:
+	gridpy_exe = r".\grid.py -log2c -2,9,2 -log2g 1,-11,-2"
+	svmtrain_exe=r"..\windows\svmtrain.exe"
+	svmpredict_exe=r"..\windows\svmpredict.exe"
+
+##### Global Variables #####
+
+# *_pathfile is the path given on the command line, *_file its last path
+# component (used as the prefix of generated file names); set by arg_process()
+train_pathfile=""
+train_file=""
+test_pathfile=""
+test_file=""
+# when true, predict_all() is run after main() at the end of the script
+if_predict_all=0
+
+# filled in by main(): feature -> F-score, and features ordered by that score
+whole_fsc_dict={}
+whole_imp_v=[]
+
+
+def arg_process():
+	"""Parse sys.argv and fill in the training/testing file globals.
+
+	Expected command line: ``training_file [testing_file]``.
+	Sets train_pathfile/train_file and, when a second argument is given,
+	test_pathfile/test_file (the *_file names are the last path component).
+
+	Raises SystemExit after printing the usage text when the number of
+	arguments is wrong, and SystemExit with an error message when a named
+	file does not exist.
+	"""
+	global train_pathfile, test_pathfile
+	global train_file, test_file
+
+	if len(sys.argv) not in [2,3]:
+		print('Usage: %s training_file [testing_file]' % sys.argv[0])
+		raise SystemExit
+
+	train_pathfile=sys.argv[1]
+	# explicit check rather than assert: asserts vanish under "python -O"
+	if not os.path.exists(train_pathfile):
+		raise SystemExit("training file not found")
+	train_file = os.path.split(train_pathfile)[1]
+
+	if len(sys.argv) == 3:
+		test_pathfile=sys.argv[2]
+		if not os.path.exists(test_pathfile):
+			raise SystemExit("testing file not found")
+		test_file = os.path.split(test_pathfile)[1]
+
+
+##### Decide sizes of selected features #####
+
+def feat_num_try_half(max_index):
+	"""Return max_index, max_index//2, max_index//4, ... while the value exceeds 1.
+
+	The value 1 itself is never part of the result, so any max_index of 1
+	or less gives an empty list.
+	"""
+	sizes=[]
+	size = max_index
+	while size > 1:
+		sizes.append(size)
+		size = size // 2
+	return sizes
+
+def feat_num_try(f_tuple):
+	"""Choose the feature-subset sizes that will be evaluated.
+
+	f_tuple is a sequence of (feature, score) pairs; the caller passes it
+	sorted by descending score. Only the leading run of pairs whose score is
+	at least 1e-20 counts. That count is halved repeatedly by
+	feat_num_try_half() and the first eight sizes are returned.
+
+	An empty f_tuple gives an empty list (the loop variable used to be
+	unbound in that case).
+	"""
+	useful = 0
+	for pair in f_tuple:
+		if pair[1] < 1e-20:
+			break
+		useful += 1
+	#only take first eight numbers (>1%)
+	return feat_num_try_half(useful)[:8]
+
+
+def random_shuffle(label, sample):
+	"""Shuffle label and sample in place, applying the same permutation to both.
+
+	The random generator is re-seeded with 1 on every call, so the resulting
+	order is reproducible.
+	"""
+	random.seed(1)  # so that result is the same every time
+	total = len(label)
+	# walk the target slot from the end towards the front and swap a randomly
+	# chosen element of the not-yet-fixed prefix into it
+	for last in range(total-1, -1, -1):
+		pick = randrange(0, last+1)
+		label[pick], label[last] = label[last], label[pick]
+		sample[pick], sample[last] = sample[last], sample[pick]
+
+
+
+### key function used in list.sort(): sort by element[1], largest first
+def value_cmpf(x):
+	"""Return the negated second item of x, so an ascending sort ranks by descending x[1]."""
+	score = x[1]
+	return -score
+
+### cal importance of features
+### return fscore_dict and feat with desc order
+def cal_feat_imp(label,sample):
+	"""Score every feature and rank the features by that score.
+
+	Returns (score_dict, feat_v): score_dict is the result of cal_Fscore()
+	and feat_v lists its keys ordered from highest to lowest score.
+	"""
+	print("calculating fsc...")
+
+	fscores = cal_Fscore(label,sample)
+
+	ranked = sorted(fscores.items(), key = value_cmpf)
+	feat_v = [pair[0] for pair in ranked]
+
+	print("fsc done")
+	return fscores,feat_v
+
+
+### select features and return new data
+def select(sample, feat_v):
+	"""Return a copy of sample restricted to the features listed in feat_v.
+
+	sample is a list of dicts (feature -> value). Each returned dict holds
+	only the listed features that are present in the corresponding input
+	dict, inserted in ascending feature order. feat_v itself is left
+	untouched (it used to be sorted in place as a side effect).
+	"""
+	wanted = sorted(feat_v)
+
+	new_samp = []
+	#for each sample
+	for s in sample:
+		#keep each selected feature that this sample actually has
+		new_samp.append(dict((f, s[f]) for f in wanted if f in s))
+
+	return new_samp
+
+
+### Do parameter searching (grid.py) 
+def train_svm(tr_file):
+	"""Run the parameter search (gridpy_exe) on tr_file.
+
+	The last non-blank line printed by the search is expected to hold three
+	numbers, which are returned as floats (c, g, rate).
+
+	Raises ValueError when the command prints nothing or when that line does
+	not consist of exactly three numbers.
+	"""
+	cmd = "%s %s" % (gridpy_exe,tr_file)
+	print(cmd)
+	print('Cross validation...')
+	# communicate() drains the pipe and waits for the child, so neither the
+	# pipe nor the process is left behind
+	output = Popen(cmd, shell = True, stdout = PIPE).communicate()[0]
+
+	lines = [ln for ln in output.splitlines() if ln.strip()]
+	if not lines:
+		raise ValueError("no output from '%s'" % cmd)
+	c,g,rate = map(float,lines[-1].split())
+
+	print('Best c=%s, g=%s CV rate=%s' % (c,g,rate))
+
+	return c,g,rate
+
+### Given (C,g) and training/testing data,
+### return predicted labels
+def predict(tr_label, tr_sample, c, g, test_label, test_sample, del_model=1, model_name=None):
+	"""Train with (c, g) on the training data and predict the testing data.
+
+	The data sets are written to "<train_file>.tr" and "<train_file>.te",
+	svmtrain_exe and svmpredict_exe are run through the shell, and the
+	predictions are read back from "<train_file>.te.o". The temporary data
+	and output files are removed afterwards; the model file is removed
+	unless del_model is false. model_name overrides the default model file
+	name "<train_file>.tr.model".
+
+	Returns the predicted labels as a list of floats.
+	"""
+	tr_file = train_file+".tr"
+	te_file = train_file+".te"
+	if model_name:  model_file = model_name
+	else:  model_file = "%s.model"%tr_file
+	out_file = "%s.o"%te_file
+
+	# train
+	writedata(tr_sample,tr_label,tr_file)
+	cmd = "%s -c %f -g %f %s %s" % (svmtrain_exe,c,g,tr_file,model_file)
+	os.system(cmd)
+
+	# test
+	writedata(test_sample,test_label,te_file)
+	cmd = "%s %s %s %s" % (svmpredict_exe, te_file,model_file,out_file )
+	print(cmd)
+	os.system(cmd)
+
+	# fill in pred_y; the with-block closes the file even when a line cannot
+	# be parsed, and always before the file is removed below
+	with open(out_file) as fp:
+		pred_y = [float(line) for line in fp]
+
+	rem_file(tr_file)
+	#rem_file("%s.out"%tr_file)
+	#rem_file("%s.png"%tr_file)
+	rem_file(te_file)
+	if del_model: rem_file(model_file)
+	rem_file(out_file)
+
+	return pred_y
+
+
+def cal_acc(pred_y, real_y):
+	"""Print "ACC: right/total" and return the fraction of pred_y entries equal to real_y.
+
+	Entries are compared position by position over the length of pred_y.
+	"""
+	total = len(pred_y)
+	# kept as a float so the final division is a true division
+	right = float(sum(1 for i in range(total) if pred_y[i] == real_y[i]))
+
+	print("ACC: %d/%d"%(right, total))
+	return right/total
+
+### balanced accuracy
+def cal_bacc(pred_y, real_y):
+	"""Return the mean of the per-class accuracies.
+
+	Entries whose real label equals 1 form the positive class, all others
+	the negative class. The raw counts are printed and written to the log.
+	A class without members contributes an accuracy of 0.
+	"""
+	p_right = n_right = 0.0
+	p_num = n_num = 0
+
+	for i in range(len(pred_y)):
+		truth = real_y[i]
+		correct = (truth == pred_y[i])
+		if truth == 1:
+			p_num += 1
+			if correct: p_right += 1
+		else:
+			n_num += 1
+			if correct: n_right += 1
+
+	print([p_right,p_num,n_right,n_num])
+	writelog("       p_yes/p_num, n_yes/n_num: %d/%d , %d/%d\n"%(p_right,p_num,n_right,n_num))
+	# avoid dividing by zero for a class that never occurs
+	p_num = p_num or 1
+	n_num = n_num or 1
+	return 0.5*( p_right/p_num + n_right/n_num )
+
+
+##### Log related #####
+def initlog(name):
+	"""Remember name as the log file (global logname) and create or empty that file."""
+	global logname
+	logname = name
+	# opening in 'w' mode creates the file or truncates an existing one
+	with open(logname, 'w'):
+		pass
+
+
+# verbosity levels; writelog() only records messages above VERBOSE_ITER
+VERBOSE_MAX=100
+VERBOSE_ITER = 3
+VERBOSE_GRID_TIME = 2
+VERBOSE_TIME = 1
+
+def writelog(str, vlevel=VERBOSE_MAX):
+	"""Append str to the log file named by the global logname.
+
+	Nothing is written unless vlevel is greater than VERBOSE_ITER.
+	"""
+	if vlevel <= VERBOSE_ITER:
+		return
+	with open(logname, 'a') as log_fd:
+		log_fd.write(str)
+
+
+def rem_file(filename):
+	"""Delete filename; a file that does not exist is silently ignored.
+
+	This mirrors the "rm -f" command the function replaced. Any other
+	failure (for example a permission error) still raises OSError.
+	"""
+	import errno
+	try:
+		unlink(filename)
+	except OSError as err:
+		if err.errno != errno.ENOENT:
+			raise
+
+##### MAIN FUNCTION #####
+def main():
+	"""Run the feature selection on the training file named on the command line.
+
+	Steps: read and shuffle the training data, compute the F-score of every
+	feature (written to "<train_file>.fscore"), run the parameter search for
+	each candidate number of top-ranked features, and keep the number with
+	the best cross-validation rate. When a testing file was given, the
+	selected features are then used to train on the whole training set and
+	the predictions are written to "<test_file>.<fnum>.pred".
+	"""
+	global train_pathfile, train_file
+	global test_pathfile, test_file
+	global whole_fsc_dict,whole_imp_v
+
+	times=5 #number of hold-out times
+	accuracy=[]
+
+	### Read Data
+	print("reading....")
+	t=time()
+	train_label, train_sample, max_index = readdata(train_pathfile)
+	t=time()-t
+	writelog("loading data '%s': %.1f sec.\n"%(train_pathfile,t), VERBOSE_TIME)
+	print("read done")
+
+	### Randomly shuffle data
+	random_shuffle(train_label, train_sample)
+
+
+	###calculate f-score of whole training data
+	#whole_imp_v contains feat with order
+	t=time()
+	whole_fsc_dict,whole_imp_v = cal_feat_imp(train_label,train_sample)
+	t=time()-t
+	writelog("cal f-score time: %.1f\n"%t, VERBOSE_TIME)
+
+	###write (sorted) f-score list in another file
+	f_tuples = list(whole_fsc_dict.items())
+	f_tuples.sort(key = value_cmpf)
+	fd = open("%s.fscore"%train_file, 'w')
+	# note: this loop reuses the name t, which held a timing value above
+	for t in f_tuples:
+		fd.write("%d: \t%.6f\n"%t)
+	fd.close()
+
+
+	### decide sizes of features to try
+	fnum_v = feat_num_try(f_tuples) #ex: [50,25,12,6,3,1]
+	for i in range(len(fnum_v)):
+		accuracy.append([])
+	writelog("try feature sizes: %s\n\n"%(fnum_v))
+
+
+	writelog("%#Feat\test. acc.\n")
+
+	est_acc=[]
+	#for each possible feature subset
+	for j in range(len(fnum_v)):
+
+		fn = fnum_v[j]  # fn is the number of features selected
+		fv = whole_imp_v[:fn] # fv is indices of selected features
+
+		t=time()
+		#pick features
+		tr_sel_samp = select(train_sample, fv)
+		tr_sel_name = train_file+".tr"
+		t=time()-t
+		writelog("\n   feature num: %d\n"%fn, VERBOSE_ITER)
+		writelog("      pick time: %.1f\n"%t, VERBOSE_TIME)
+
+		t=time()
+		writedata(tr_sel_samp,train_label,tr_sel_name)
+		t=time()-t
+		writelog("      write data time: %.1f\n"%t, VERBOSE_TIME)
+
+
+		t=time()
+		# choose best c, gamma from splitted training sample
+		c,g, cv_acc = train_svm(tr_sel_name)
+		t=time()-t
+		writelog("      choosing c,g time: %.1f\n"%t, VERBOSE_GRID_TIME)
+
+		est_acc.append(cv_acc)
+		writelog("%d:\t%.5f\n"%(fnum_v[j],cv_acc) )
+
+	print(fnum_v)
+	print(est_acc)
+
+	# NOTE(review): max() raises ValueError when fnum_v (and so est_acc) is
+	# empty, and tr_sel_name below is then unset as well
+	fnum=fnum_v[est_acc.index(max(est_acc))]
+#	print(est_acc.index(max(est_acc)))
+	print('Number of selected features %s' % fnum)
+	print('Please see %s.select for details' % train_file)
+
+	#result for features selected
+	sel_fv = whole_imp_v[:fnum]
+
+	writelog("max validation accuarcy: %.6f\n"%max(est_acc))
+	writelog("\nselect features: %s\n"%sel_fv)
+	writelog("%s features\n"%fnum)
+		
+
+	# REMOVE INTERMEDIATE TEMPORARY FILE: training file after selection
+	# (the .out and .png files are presumably left behind by the grid search)
+	rem_file(tr_sel_name)
+	rem_file("%s.out"%tr_sel_name)
+	rem_file("%s.png"%tr_sel_name)
+
+
+	### do testing
+
+	test_label=None
+	test_sample=None
+	if test_pathfile != "":
+		print("reading testing data....")
+		test_label, test_sample, max_index = readdata(test_pathfile)
+		writelog("\nloading testing data '%s'\n"%test_pathfile)
+		print("read done")
+		
+		#picking features
+		train_sel_samp = select(train_sample, sel_fv)
+		test_sel_samp = select(test_sample, sel_fv)
+
+		#grid search
+		train_sel_name = "%s.%d"%(train_file,fnum)
+		writedata(train_sel_samp,train_label,train_sel_name)
+		c,g, cv_acc = train_svm(train_sel_name)
+		writelog("best (c,g)= %s, cv-acc = %.6f\n"%([c,g],cv_acc))
+
+		# REMOVE INTERMEDIATE TEMPORARY FILE: training file after selection
+		rem_file(train_sel_name)
+
+
+		### predict
+		# del_model is 0 here, so the model file "<train_sel_name>.model" is kept
+		pred_y = predict(train_label, train_sel_samp, c, g, test_label, test_sel_samp, 0, "%s.model"%train_sel_name)
+
+		#calculate accuracy
+		acc = cal_acc(pred_y, test_label)
+		##acc = cal_bacc(pred_y, test_label)
+		writelog("testing accuracy = %.6f\n"%acc)
+
+		#writing predict labels
+		out_name = "%s.%d.pred"%(test_file,fnum)
+		fd = open(out_name, 'w')
+		for y in pred_y: fd.write("%f\n"%y)
+		fd.close()
+		
+
+### predict all possible sets ###
+def predict_all():
+	"""Train and test once for every candidate number of top-ranked features.
+
+	Uses the ranking left in whole_fsc_dict / whole_imp_v by main(). For
+	each candidate size the parameter search is run, the testing data is
+	predicted, the accuracy is logged and the predictions are written to
+	"<test_file>.<fnum>.pred".
+	"""
+	train_label, train_sample, max_index = readdata(train_pathfile)
+	test_label, test_sample, m = readdata(test_pathfile)
+
+	random_shuffle(train_label, train_sample)
+
+	###whole_fsc_dict, ordered_feats = cal_feat_imp(train_label,train_sample)
+	ordered_feats = whole_imp_v
+	# list() is required: on Python 3 dict.items() is a view without sort()
+	f_tuples = list(whole_fsc_dict.items())
+	f_tuples.sort(key = value_cmpf)
+
+	fnum_v = feat_num_try(f_tuples) #ex: [50,25,12,6,3,1]
+
+	writelog("\nTest All %s\n"%fnum_v)
+	for fnum in fnum_v:
+		sel_fv = ordered_feats[:fnum]
+
+		#picking features
+		train_sel_samp = select(train_sample, sel_fv)
+		test_sel_samp = select(test_sample, sel_fv)
+
+		#grid search
+		train_sel_name = "%s.%d"%(train_file,fnum)
+		writedata(train_sel_samp,train_label,train_sel_name)
+		c,g, cv_acc = train_svm(train_sel_name)
+		writelog("best (c,g)= %s, cv-acc = %.6f\n"%([c,g],cv_acc))
+
+		# REMOVE INTERMEDIATE TEMPORARY FILE: training file after selection
+		rem_file(train_sel_name)
+
+		#predict
+		pred_y = predict(train_label, train_sel_samp, c, g, test_label, test_sel_samp)
+
+		#calculate accuracy
+		acc = cal_acc(pred_y, test_label)
+		##acc = cal_bacc(pred_y, test_label)
+		writelog("feat# %d, testing accuracy = %.6f\n"%(fnum,acc))
+
+		#writing predict labels; the with-block closes the file on any error
+		out_name = "%s.%d.pred"%(test_file,fnum)
+		with open(out_name, 'w') as fd:
+			for y in pred_y: fd.write("%f\n"%y)
+
+		# switch: set to 1 to also remove the .out/.png files of this run
+		del_out_png = 0
+		if del_out_png:
+			rem_file("%s.out"%train_sel_name)
+			rem_file("%s.png"%train_sel_name)
+
+
+###return a dict containing F_j
+def cal_Fscore(labels,samples):
+	"""Compute one score per feature from labelled sparse samples.
+
+	samples is a list of dicts (feature index -> value) and labels the
+	matching list of class labels. Feature indices are treated as 1-based.
+	For each feature the score is SB / SW, where SB sums, over the classes,
+	count * (class mean - overall mean)**2 and SW sums, over the classes,
+	(sum of squares - (sum)**2 / count), plus a small constant that keeps
+	the division defined.
+
+	Returns a dict mapping feature index (1 .. largest index seen) to score.
+	"""
+	data_num=float(len(samples))
+	p_num = {} #key: label;  value: data num
+	max_idx = -1
+
+	### pass 1: check number of each class and max index of features
+	for p, point in enumerate(samples):
+		label=labels[p]
+		p_num[label] = p_num.get(label, 0) + 1
+		for f in point:
+			if f>max_idx: max_idx=f
+	### now p_num and max_idx are set
+
+	### initialize variables
+	sum_f = [0] * max_idx #index: feat_idx;  value: sum
+	sum_l_f = {} #dict of lists.  key1: label; index2: feat_idx; value: sum
+	sumq_l_f = {} #dict of lists.  key1: label; index2: feat_idx; value: sum of square
+	for la in p_num:
+		sum_l_f[la] = [0] * max_idx
+		sumq_l_f[la] = [0] * max_idx
+
+	### pass 2: calculate some stats of data
+	for p, point in enumerate(samples):
+		class_sum = sum_l_f[labels[p]]
+		class_sumq = sumq_l_f[labels[p]]
+		for feat, v in point.items():
+			f = feat-1 # position of the 1-based feature index in the lists
+			sum_f[f] += v
+			class_sum[f] += v
+			class_sumq[f] += v**2
+	### now sum_f, sum_l_f, sumq_l_f are done
+
+	### for each feature, calculate f-score
+	eps = 1e-12
+	F={} #key: feat_idx;  value: fscore
+	for f in range(max_idx):
+		overall_mean = sum_f[f]/data_num
+		SB = 0
+		SW = eps
+		for la, count in p_num.items():
+			SB += (count * (sum_l_f[la][f]/count - overall_mean)**2 )
+			SW += (sumq_l_f[la][f] - (sum_l_f[la][f]**2)/count)
+
+		F[f+1] = SB / SW
+
+	return F
+
+
+###### svm data IO ######
+
+def readdata(filename):
+	"""Read a data file with lines of the form "label index:value index:value ...".
+
+	Lines whose first non-blank character is "#" are comments. Blank lines
+	are skipped as well (they used to raise IndexError).
+
+	Returns (labels, samples, max_index): labels is a list of floats,
+	samples a list of dicts mapping int index to float value, and max_index
+	the largest index seen (0 when there is none).
+	"""
+	labels=[]
+	samples=[]
+	max_index=0
+	#load training data; the with-block closes the file even on a parse error
+	with open(filename) as fp:
+		for line in fp:
+			line=line.strip()
+			# allow blank lines and comment lines in the data
+			if not line or line[0]=="#":
+				continue
+
+			elems = line.split()
+			sample = {}
+			for e in elems[1:]:
+				points = e.split(":")
+				p0 = int( points[0].strip() )
+				p1 = float( points[1].strip() )
+				sample[p0] = p1
+				if p0 > max_index:
+					max_index = p0
+			labels.append(float(elems[0]))
+			samples.append(sample)
+
+	return labels,samples,max_index
+
+def writedata(samples,labels,filename):
+	"""Write samples as lines "label index:value index:value ...".
+
+	samples is a list of dicts (index -> value); the entries of each line
+	are written in ascending index order. labels supplies the label of each
+	line; when it is empty or None every line gets the label "0". The
+	output goes to filename, or to standard output when filename is empty.
+	"""
+	if filename:
+		fp=open(filename,"w")
+	else:
+		fp=sys.stdout
+
+	try:
+		for i in range(len(samples)):
+			if labels:
+				fp.write("%s"%labels[i])
+			else:
+				fp.write("0")
+			for k in sorted(samples[i].keys()):
+				fp.write(" %d:%f"%(k,samples[i][k]))
+			fp.write("\n")
+		fp.flush()
+	finally:
+		# only close what was opened here: closing sys.stdout would break
+		# every later print()
+		if fp is not sys.stdout:
+			fp.close()
+
+
+###### PROGRAM ENTRY POINT ######
+
+# The statements below run whenever this file is executed or imported.
+arg_process()
+
+# the log file is named after the training file: "<train_file>.select"
+initlog("%s.select"%train_file)
+writelog("start: %s\n\n"%datetime.now())
+main()
+
+# do testing on all possible feature sets
+if if_predict_all :
+	predict_all()
+
+writelog("\nend: \n%s\n"%datetime.now())
+

Added: trunk/dports/math/libsvm/files/gridregression.py
===================================================================
--- trunk/dports/math/libsvm/files/gridregression.py	                        (rev 0)
+++ trunk/dports/math/libsvm/files/gridregression.py	2012-01-13 13:07:05 UTC (rev 88849)
@@ -0,0 +1,311 @@
+#!/usr/bin/env python
+
+
+
+import os, sys, traceback
+import getpass
+from threading import Thread
+from subprocess import *
+
+if(sys.hexversion < 0x03000000):
+	import Queue
+else:
+	import queue as Queue
+
+
+# svmtrain and gnuplot executable
+# (defaults; process_options() replaces them when -svmtrain / -gnuplot is given)
+
+is_win32 = (sys.platform == 'win32')
+svmtrain_exe = "../svm-train"
+gnuplot_exe = "/usr/bin/gnuplot"
+# example for windows
+# svmtrain_exe = r"c:\tmp\libsvm-2.4\windows\svmtrain.exe"
+# gnuplot_exe = r"c:\tmp\gp373w32\pgnuplot.exe"
+
+# global parameters and their default values
+
+fold = 5
+# each triple is (begin, end, step) of a search range; workers raise 2 to
+# these exponents to obtain the -c, -g and -p values passed to svm-train
+c_begin, c_end, c_step = -1,  6, 1
+g_begin, g_end, g_step =  0, -8, -1
+p_begin, p_end, p_step =  -8, -1, 1
+# these names are assigned later by process_options(); a "global" statement
+# at module level has no effect and only serves as a reminder
+global dataset_pathname, dataset_title, pass_through_string
+global out_filename, png_filename
+
+# experimental
+
+# host names; main() starts one SSHWorker per entry
+ssh_workers = []
+# ssh_workers = ['linux1','linux1','linux2','linux2','linux3', 'linux4', 'linux6','linux7','linux8','linux8','linux9','linux10','linux11','linux12']
+# number of LocalWorker threads started by main()
+nr_local_worker = 1
+
+# process command line options, set global parameters
+def process_options(argv=sys.argv):
+    """Parse argv and store the settings in the module-level globals.
+
+    The last element of argv is the dataset path. Recognised options in
+    front of it set the search ranges, the fold count, the executables and
+    the output file names; every other argument is collected into
+    pass_through_string, which the workers append to the svm-train command.
+
+    Prints the usage text and exits with status 1 when argv holds no
+    dataset, or when the renamed options -c / -g are used. Raises
+    AssertionError when an executable or the dataset does not exist.
+    """
+
+    global fold
+    global c_begin, c_end, c_step
+    global g_begin, g_end, g_step
+    global p_begin, p_end, p_step
+    global dataset_pathname, dataset_title, pass_through_string
+    global svmtrain_exe, gnuplot_exe, gnuplot, out_filename, png_filename
+    
+    # NOTE(review): the usage text names grid.py although this file is
+    # gridregression.py
+    usage = """\
+Usage: grid.py [-log2c begin,end,step] [-log2g begin,end,step] [-log2p begin,end,step] [-v fold] 
+[-svmtrain pathname] [-gnuplot pathname] [-out pathname] [-png pathname]
+[additional parameters for svm-train] dataset"""
+
+    if len(argv) < 2:
+        print(usage)
+        sys.exit(1)
+
+    dataset_pathname = argv[-1]
+    dataset_title = os.path.split(dataset_pathname)[1]
+    out_filename = '%s.out' % dataset_title
+    png_filename = '%s.png' % dataset_title
+    pass_through_options = []
+
+    # walk over everything between the program name and the dataset; an
+    # option that takes a value advances i once more to consume it
+    i = 1
+    while i < len(argv) - 1:
+        if argv[i] == "-log2c":
+            i = i + 1
+            (c_begin,c_end,c_step) = map(float,argv[i].split(","))
+        elif argv[i] == "-log2g":
+            i = i + 1
+            (g_begin,g_end,g_step) = map(float,argv[i].split(","))
+        elif argv[i] == "-log2p":
+            i = i + 1
+            (p_begin,p_end,p_step) = map(float,argv[i].split(","))
+        elif argv[i] == "-v":
+            i = i + 1
+            # kept as a string; it is only interpolated into the command line
+            fold = argv[i]
+        elif argv[i] in ('-c','-g'):
+            print("Option -c and -g are renamed.")
+            print(usage)
+            sys.exit(1)
+        elif argv[i] == '-svmtrain':
+            i = i + 1
+            svmtrain_exe = argv[i]
+        elif argv[i] == '-gnuplot':
+            i = i + 1
+            gnuplot_exe = argv[i]
+        elif argv[i] == '-out':
+            i = i + 1
+            out_filename = argv[i]
+        elif argv[i] == '-png':
+            i = i + 1
+            png_filename = argv[i]
+        else:
+            pass_through_options.append(argv[i])
+        i = i + 1
+
+    pass_through_string = " ".join(pass_through_options)
+    assert os.path.exists(svmtrain_exe),"svm-train executable not found"
+    assert os.path.exists(gnuplot_exe),"gnuplot executable not found"
+    assert os.path.exists(dataset_pathname),"dataset not found"
+    # the gnuplot pipe is not opened (line below is disabled), so the global
+    # name gnuplot stays undefined
+#    gnuplot = Popen(gnuplot_exe,stdin = PIPE).stdin
+
+
+def range_f(begin,end,step):
+    """Return [begin, begin+step, ...] up to and including end.
+
+    Like range, but works on non-integer values too, and the end point is
+    part of the result when it is hit. A range that is empty for the given
+    direction of step gives [].
+
+    Raises ValueError when step is 0 (this used to loop forever).
+    """
+    if step == 0:
+        raise ValueError("range_f() step must not be zero")
+    seq = []
+    while True:
+        if step > 0 and begin > end: break
+        if step < 0 and begin < end: break
+        seq.append(begin)
+        begin = begin + step
+    return seq
+
+def permute_sequence(seq):
+    """Reorder seq so that its middle element comes first.
+
+    The halves on either side of the middle are permuted the same way and
+    then interleaved, left half first. Sequences with fewer than two
+    elements are returned unchanged.
+    """
+    count = len(seq)
+    if count < 2:
+        return seq
+
+    pivot = count // 2
+    head = permute_sequence(seq[:pivot])
+    tail = permute_sequence(seq[pivot+1:])
+
+    ordered = [seq[pivot]]
+    # take alternately from both halves until each one is used up
+    for k in range(max(len(head), len(tail))):
+        if k < len(head): ordered.append(head[k])
+        if k < len(tail): ordered.append(tail[k])
+
+    return ordered
+
+def redraw (db,tofile=0):
+    """Send a contour plot of db to the gnuplot pipe.
+
+    db is a list of tuples whose third item is the plotted value; each tuple
+    is formatted with "%s %s %s", so three items are expected
+    (NOTE(review): nothing in this file fills db). It is sorted in place.
+    With tofile set the plot goes to png_filename, otherwise to the screen
+    terminal. Does nothing for an empty db.
+
+    Relies on the global gnuplot being a writable binary pipe.
+    """
+    if len(db) == 0: return
+
+    def send(text):
+        # the pipe is binary: commands are built as text and encoded here,
+        # because bytes %-formatting rejects str and number arguments
+        if not isinstance(text, bytes):
+            text = text.encode()
+        gnuplot.write(text)
+
+    # highest value in db (the lambda used to swallow db as part of a tuple)
+    begin_level = round(max(x[2] for x in db)) - 3
+    step_size = 0.5
+    if tofile:
+        send("set term png transparent small color\n")
+        send("set output \"%s\"\n" % png_filename.replace('\\','\\\\'))
+    else:
+        if is_win32:
+            send("set term windows\n")
+        else:
+            send("set term x11\n")
+    send("set xlabel \"lg(C)\"\n")
+    send("set ylabel \"lg(gamma)\"\n")
+    send("set xrange [%s:%s]\n" % (c_begin,c_end))
+    send("set yrange [%s:%s]\n" % (g_begin,g_end))
+    send("set contour\n")
+    send("set cntrparam levels incremental %s,%s,100\n" % (begin_level,step_size))
+    send("set nosurface\n")
+    send("set view 0,0\n")
+    send("set label \"%s\" at screen 0.4,0.9\n" % dataset_title)
+    send("splot \"-\" with lines\n")
+
+    db.sort(key = lambda x:(x[0], -x[1]))
+
+    # a blank line separates the rows that share the same first item
+    prevc = db[0][0]
+    for line in db:
+        if prevc != line[0]:
+            send("\n")
+            prevc = line[0]
+        send("%s %s %s\n" % line)
+    send("e\n")
+    send("\n") # force gnuplot back to prompt when term set failure
+    gnuplot.flush()
+
+
+def calculate_jobs():
+    """Return the job list for the whole search grid.
+
+    Every job is a one-element list holding a (c, g, p) exponent triple.
+    The three ranges are taken from the module-level begin/end/step
+    settings and reordered with permute_sequence(); c varies slowest and p
+    fastest.
+    """
+    c_seq = permute_sequence(range_f(c_begin,c_end,c_step))
+    g_seq = permute_sequence(range_f(g_begin,g_end,g_step))
+    p_seq = permute_sequence(range_f(p_begin,p_end,p_step))
+
+    return [[(c_exp,g_exp,p_exp)]
+            for c_exp in c_seq
+            for g_exp in g_seq
+            for p_exp in p_seq]
+
+class WorkerStopToken:  # used to notify the worker to stop
+        """Marker class: a job whose first item is this class tells a Worker to stop."""
+        pass
+
+class Worker(Thread):
+    """Thread that takes (c, g, p) exponent triples from a job queue.
+
+    For every job, run_one() (supplied by a subclass) is called with 2 raised
+    to each exponent, and (name, c, g, p, rate) is put on the result queue.
+    """
+    def __init__(self,name,job_queue,result_queue):
+        Thread.__init__(self)
+        self.name = name
+        self.job_queue = job_queue
+        self.result_queue = result_queue
+    def run(self):
+        """Process jobs until the stop token arrives or a job fails."""
+        while True:
+            (cexp,gexp,pexp) = self.job_queue.get()
+            if cexp is WorkerStopToken:
+                # put the token back so that the other workers see it too
+                self.job_queue.put((cexp,gexp,pexp))
+                # print 'worker %s stop.' % self.name
+                break
+            try:
+                rate = self.run_one(2.0**cexp,2.0**gexp,2.0**pexp)
+                if rate is None: raise RuntimeError("get no rate")
+            except Exception:
+                # we failed, let others do that and we just quit
+                # (Exception, not a bare except, so that KeyboardInterrupt
+                # and SystemExit are not swallowed here)
+                traceback.print_exc()
+
+                self.job_queue.put((cexp,gexp,pexp))
+                print('worker %s quit.' % self.name)
+                break
+            else:
+                self.result_queue.put((self.name,cexp,gexp,pexp,rate))
+
+class LocalWorker(Worker):
+    """Worker that runs svm-train on the local machine."""
+    def run_one(self,c,g,p):
+        """Run one cross validation and return the number it reports.
+
+        The number is the last field of the first output line containing
+        "Cross". Returns None when no such line is printed.
+        """
+        cmdline = '%s -s 3 -c %s -g %s -p %s -v %s %s %s' % \
+          (svmtrain_exe,c,g,p,fold,pass_through_string,dataset_pathname)
+        # communicate() reads all output and waits for the child to exit, so
+        # no unreaped process or open pipe is left behind per job
+        output = Popen(cmdline,shell=True,stdout=PIPE).communicate()[0]
+        for line in output.splitlines():
+            if b"Cross" in line:
+                return float(line.split()[-1])
+
+class SSHWorker(Worker):
+    """Worker that runs svm-train on another host through ssh.
+
+    The remote command first changes to the directory that was current
+    when the worker was created.
+    """
+    def __init__(self,name,job_queue,result_queue,host):
+        Worker.__init__(self,name,job_queue,result_queue)
+        self.host = host
+        self.cwd = os.getcwd()
+    def run_one(self,c,g,p):
+        """Run one cross validation on self.host and return the number it reports.
+
+        The number is the last field of the first output line containing
+        "Cross". Returns None when no such line is printed.
+        """
+        cmdline = 'ssh %s "cd %s; %s -s 3 -c %s -g %s -p %s -v %s %s %s"' % \
+          (self.host,self.cwd,
+           svmtrain_exe,c,g,p,fold,pass_through_string,dataset_pathname)
+        # communicate() reads all output and waits for the child to exit, so
+        # no unreaped process or open pipe is left behind per job
+        output = Popen(cmdline,shell=True,stdout=PIPE).communicate()[0]
+        for line in output.splitlines():
+            if b"Cross" in line:
+                return float(line.split()[-1])
+
+def main():
+    """Run the grid search and print the best parameters.
+
+    Every (c, g, p) combination is queued, the workers are started, and each
+    result is appended to out_filename as it arrives. The combination with
+    the smallest reported value is printed at the end as
+    "best_c best_g best_p best_mse".
+    """
+
+    # set parameters
+
+    process_options()
+
+    # put jobs in queue
+
+    jobs = calculate_jobs()
+    #print(len(jobs))
+    job_queue = Queue.Queue(0)
+    result_queue = Queue.Queue(0)
+
+    for line in jobs:
+        for (c,g,p) in line:
+            job_queue.put((c,g,p))
+
+    # hack the queue to become a stack --
+    # this is important when some thread
+    # failed and re-put a job. It we still
+    # use FIFO, the job will be put
+    # into the end of the queue, and the graph
+    # will only be updated in the end
+
+    job_queue._put = job_queue.queue.appendleft
+
+
+    # fire ssh workers
+
+    if ssh_workers:
+        for host in ssh_workers:
+            SSHWorker(host,job_queue,result_queue,host).start()
+
+    # fire local workers
+
+    for i in range(nr_local_worker):
+        LocalWorker('local',job_queue,result_queue).start()
+
+    # gather results
+
+    done_jobs = {}
+    best_mse = float('+inf')
+    # defined up front: the progress line below must not hit an unbound name
+    # when the first results are no improvement (mse of inf or nan)
+    best_c = best_g = best_p = None
+
+    result_file = open(out_filename,'w')
+    try:
+        for line in jobs:
+            for (c,g,p) in line:
+                # results arrive in any order; wait until this job is done
+                while (c,g,p) not in done_jobs:
+                    (worker,c1,g1,p1,mse) = result_queue.get()
+                    done_jobs[(c1,g1,p1)] = mse
+                    result_file.write('%s %s %s %s\n' % (c1,g1,p1,mse))
+                    result_file.flush()
+                    if mse < best_mse:
+                        best_mse = mse
+                        best_c = 2.0**c1
+                        best_g = 2.0**g1
+                        best_p = 2.0**p1
+                    print("[%s] %s %s %s %s (best c=%s, g=%s, p=%s, mse=%s)" % \
+(worker,c1,g1,p1,mse,best_c,best_g,best_p,best_mse))
+    finally:
+        result_file.close()
+
+    job_queue.put((WorkerStopToken,None,None))
+    print("%s %s %s %s" % (best_c,best_g,best_p,best_mse))
+
+main()

Added: trunk/dports/math/libsvm/files/plotroc.py
===================================================================
--- trunk/dports/math/libsvm/files/plotroc.py	                        (rev 0)
+++ trunk/dports/math/libsvm/files/plotroc.py	2012-01-13 13:07:05 UTC (rev 88849)
@@ -0,0 +1,219 @@
+#!/usr/bin/env python
+#This tool allow users to plot SVM-prob ROC curve from data
+from svmutil import *
+from sys import argv, platform
+from os import path, popen
+from random import randrange , seed
+from operator import itemgetter
+from time import sleep
+
+#Candidate locations of the gnuplot executable; check_gnuplot_exe() picks the
+#first one that exists.
+#A Windows path containing spaces must be surrounded by double quotes. Write
+#it as a raw string (leading r), or else double every backslash.
+gnuplot_exe_list = [r'"C:\Program Files\gnuplot\pgnuplot.exe"', "/usr/bin/gnuplot","/usr/local/bin/gnuplot"]
+
+
+def get_pos_deci(train_y, train_x, test_y, test_x, param):
+	"""Train on the training data and return decision values for the test data.
+
+	Each decision value is multiplied by the model's first label, so that a
+	positive result points to the positive class.
+
+	Returns (deci, model).
+	"""
+	model = svm_train(train_y, train_x, param)
+	#predict and grab decision value, assure deci>0 for label+,
+	#the positive decision value = val[0]*labels[0]
+	first_label = model.get_labels()[0]
+	py, evals, raw_deci = svm_predict(test_y, test_x, model)
+	deci = [first_label*val[0] for val in raw_deci]
+	return deci,model
+
+#get_cv_deci(prob_y[], prob_x[], svm_parameter param, nr_fold)
+#input raw attributes, labels, param, cv_fold in decision value building
+#output list of decision value, remember to seed(0)
+def get_cv_deci(prob_y, prob_x, param, nr_fold):
+	"""Return cross-validated decision values for the whole data set.
+
+	With nr_fold of 0 or 1 the model is trained and evaluated on the same
+	data. Otherwise prob_y and prob_x are shuffled IN PLACE, and each fold
+	is predicted by a model trained on the remaining folds; the returned
+	values follow the shuffled order.
+	"""
+	if nr_fold in (0, 1):
+		deci,model = get_pos_deci(prob_y, prob_x, prob_y, prob_x, param)
+		return deci
+	prob_l = len(prob_y)
+
+	#random permutation by swapping i and j instance
+	for i in range(prob_l):
+		j = randrange(i,prob_l)
+		prob_x[i], prob_x[j] = prob_x[j], prob_x[i]
+		prob_y[i], prob_y[j] = prob_y[j], prob_y[i]
+
+	#cross training : folding
+	#fold k covers the instances bounds[k] .. bounds[k+1]-1
+	bounds = [k * prob_l // nr_fold for k in range(nr_fold + 1)]
+	deci = []
+	for begin, end in zip(bounds, bounds[1:]):
+		train_x = prob_x[:begin] + prob_x[end:]
+		train_y = prob_y[:begin] + prob_y[end:]
+		fold_deci, fold_model = get_pos_deci(train_y, train_x, prob_y[begin:end], prob_x[begin:end], param)
+		deci.extend(fold_deci)
+	return deci
+
+#a simple gnuplot object
+class gnuplot:
+	"""Small wrapper around a gnuplot process that is driven through a pipe.
+
+	Assigning a string to any attribute sends 'set <attribute> "<value>"' to
+	gnuplot (see __setattr__). The object's own state is therefore kept in
+	__dict__ directly: iface (the pipe), screen_term and term.
+	"""
+	def __init__(self, term='onscreen'):
+		# -persists leave plot window on screen after gnuplot terminates
+		if platform == 'win32':
+			cmdline = gnuplot_exe
+			self.__dict__['screen_term'] = 'windows'
+		else:
+			cmdline = gnuplot_exe + ' -persist'
+			self.__dict__['screen_term'] = 'x11'
+		self.__dict__['iface'] = popen(cmdline,'w')
+		self.set_term(term)
+
+	def set_term(self, term):
+		"""Select the output: 'onscreen', or a file name containing .ps or .png."""
+		# remembered (bypassing __setattr__) so that __repr__ can report it
+		self.__dict__['term'] = term
+		if term=='onscreen':
+			self.writeln("set term %s" % self.screen_term)
+		else:
+			#term must be either x.ps or x.png
+			if term.find('.ps')>0:
+				self.writeln("set term postscript eps color 22")
+			elif term.find('.png')>0:
+				self.writeln("set term png")
+			else:
+				print("You must set term to either *.ps or *.png")
+				raise SystemExit
+			# goes through __setattr__, i.e. sends: set output "<term>"
+			self.output = term
+
+	def writeln(self,cmdline):
+		"""Send one command line to gnuplot."""
+		self.iface.write(cmdline + '\n')
+
+	def __setattr__(self, attr, val):
+		"""Translate a string attribute assignment into a gnuplot "set" command."""
+		if isinstance(val, str):
+			self.writeln('set %s \"%s\"' % (attr, val))
+		else:
+			print("Unsupport format:", attr, val)
+			raise SystemExit
+
+	#terminate gnuplot
+	def __del__(self):
+		# the pipe is missing when __init__ failed before creating it
+		iface = self.__dict__.get('iface')
+		if iface is None:
+			return
+		self.writeln("quit")
+		iface.flush()
+		iface.close()
+
+	def __repr__(self):
+		# used to reference an undefined name "term" and raise NameError
+		return "<gnuplot instance: output=%s>" % self.__dict__.get('term')
+
+	#data is a list of [x,y]
+	def plotline(self, data):
+		"""Plot data, a list of [x, y] pairs, as one line without a title."""
+		self.writeln("plot \"-\" notitle with lines linewidth 1")
+		for i in range(len(data)):
+			self.writeln("%f %f" % (data[i][0], data[i][1]))
+			sleep(0) #delay
+		self.writeln("e")
+		if platform=='win32':
+			sleep(3)
+
+#processing argv and set some global variables
+def proc_argv(argv = argv):
+	"""Split the command line into its parts.
+
+	The command line : ./plotroc.py [-v cv_fold | -T testing_file] [libsvm-options] training_file
+
+	Returns (options, fold, train_file, test_file): options is every
+	unrecognised argument joined by spaces, fold defaults to 5 and
+	test_file to None. The training file is always the last argument.
+	"""
+	#print("Usage: %s " % argv[0])
+	train_file = argv[-1]
+	test_file, fold, passthrough = None, 5, []
+	pos = 1
+	last = len(argv) - 1
+	while pos < last:
+		flag = argv[pos]
+		if flag == '-T':
+			pos += 1
+			test_file = argv[pos]
+		elif flag == '-v':
+			pos += 1
+			fold = int(argv[pos])
+		else:
+			passthrough.append(flag)
+		pos += 1
+
+	return ' '.join(passthrough), fold, train_file, test_file
+
+def plot_roc(deci, label, output, title):
+	"""Plot the ROC curve of the decision values, to a file and on screen.
+
+	deci and label are parallel lists; a label greater than 0 counts as
+	positive. output is the name of the plot file and title the caption
+	(output is used when title is None). The area under the curve is shown
+	in the plot title.
+	"""
+	#pair every decision value with its label, counting both classes
+	pos, neg = 0, 0
+	db = []
+	for i, lab in enumerate(label):
+		if lab > 0:
+			pos += 1
+		else:
+			neg += 1
+		db.append([deci[i], lab])
+
+	#sorting by decision value
+	db.sort(key=itemgetter(0), reverse=True)
+
+	#calculate ROC
+	xy_arr = []
+	tp, fp = 0., 0.			#assure float division
+	for _, lab in db:
+		if lab > 0:		#positive
+			tp += 1
+		else:
+			fp += 1
+		xy_arr.append([fp/neg,tp/pos])
+
+	#area under curve
+	aoc = 0.
+	prev_x = 0
+	for x,y in xy_arr:
+		if x == prev_x:
+			continue
+		aoc += (x - prev_x) * y
+		prev_x = x
+
+	#begin gnuplot
+	if title == None:
+		title = output
+	caption = "ROC curve of %s (AUC = %.4f)" % (title,aoc)
+	x_text = "False Positive Rate"
+	y_text = "True Positive Rate"
+	#also write to file
+	g = gnuplot(output)
+	g.xlabel = x_text
+	g.ylabel = y_text
+	g.title = caption
+	g.plotline(xy_arr)
+	#display on screen
+	s = gnuplot('onscreen')
+	s.xlabel = x_text
+	s.ylabel = y_text
+	s.title = caption
+	s.plotline(xy_arr)
+
+def check_gnuplot_exe():
+	"""Set the global gnuplot_exe to the first existing entry of gnuplot_exe_list.
+
+	Double quotes inside an entry are ignored for the existence check but
+	kept in the stored value. Exits with a message when no entry exists.
+	"""
+	global gnuplot_exe
+	found = (g for g in gnuplot_exe_list if path.exists(g.replace('"','')))
+	gnuplot_exe = next(found, None)
+	if gnuplot_exe is None:
+		print("You must add correct path of 'gnuplot' into gnuplot_exe_list")
+		raise SystemExit
+
+def main():
+	"""Plot the ROC curve for the files named on the command line.
+
+	With -T the model is trained on the training file and evaluated on the
+	testing file; otherwise cross validation is done on the training file.
+	Both files must contain exactly the labels 1 and -1. The plot is written
+	to "<training file name>-roc.png".
+	"""
+	check_gnuplot_exe()
+	if len(argv) <= 1:
+		print("Usage: %s [-v cv_fold | -T testing_file] [libsvm-options] training_file" % argv[0])
+		raise SystemExit
+	param,fold,train_file,test_file = proc_argv()
+	train_name = path.split(train_file)[1]
+	output_file = train_name + '-roc.png'
+	binary_labels = set([1,-1])
+	#read data
+	train_y, train_x = svm_read_problem(train_file)
+	if set(train_y) != binary_labels:
+		print("ROC is only applicable to binary classes with labels 1, -1")
+		raise SystemExit
+
+	#get decision value, with positive = label+
+	seed(0)	#reset random seed
+	if not test_file:		#single file -> CV
+		deci = get_cv_deci(train_y, train_x, param, fold)
+		plot_roc(deci, train_y, output_file, train_name)
+		return
+
+	#go with test_file
+	output_title = "%s on %s" % (path.split(test_file)[1], train_name)
+	test_y, test_x = svm_read_problem(test_file)
+	if set(test_y) != binary_labels:
+		print("ROC is only applicable to binary classes with labels 1, -1")
+		raise SystemExit
+	deci,model = get_pos_deci(train_y, train_x, test_y, test_x, param)
+	plot_roc(deci, test_y, output_file, output_title)
+
+if __name__ == '__main__':
+	main()	
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.macosforge.org/pipermail/macports-changes/attachments/20120113/3a867a77/attachment-0001.html>


More information about the macports-changes mailing list