[117348] trunk/dports/textproc
hum at macports.org
hum at macports.org
Sun Feb 23 06:36:18 PST 2014
Revision: 117348
https://trac.macports.org/changeset/117348
Author: hum at macports.org
Date: 2014-02-23 06:36:18 -0800 (Sun, 23 Feb 2014)
Log Message:
-----------
New port: word2vec @20131218 - Tool for computing continuous distributed representations of words.
Added Paths:
-----------
trunk/dports/textproc/word2vec/
trunk/dports/textproc/word2vec/Portfile
trunk/dports/textproc/word2vec/files/
trunk/dports/textproc/word2vec/files/patch-compute-accuracy.c.diff
trunk/dports/textproc/word2vec/files/patch-demo.diff
trunk/dports/textproc/word2vec/files/patch-malloc.diff
Added: trunk/dports/textproc/word2vec/Portfile
===================================================================
--- trunk/dports/textproc/word2vec/Portfile (rev 0)
+++ trunk/dports/textproc/word2vec/Portfile 2014-02-23 14:36:18 UTC (rev 117348)
@@ -0,0 +1,65 @@
+# -*- coding: utf-8; mode: tcl; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- vim:fenc=utf-8:ft=tcl:et:sw=4:ts=4:sts=4
+# $Id$
+
+PortSystem 1.0
+
+name word2vec
+version 20131218
+categories textproc
+maintainers hum openmaintainer
+
+description Tool for computing continuous distributed representations of words
+
+long_description This tool provides an efficient implementation of the \
+ continuous bag-of-words and skip-gram architectures for \
+ computing vector representations of words. These \
+ representations can be subsequently used in many natural \
+ language processing applications and for further research.
+
+homepage https://code.google.com/p/word2vec/
+platforms darwin
+license Apache-2
+
+fetch.type svn
+svn.url http://word2vec.googlecode.com/svn/trunk
+svn.revision 37
+worksrcdir trunk
+
+patchfiles patch-malloc.diff \
+ patch-compute-accuracy.c.diff \
+ patch-demo.diff
+
+use_configure no
+variant universal {}
+
+configure.optflags -O2
+
+build.args CC="${configure.cc}" \
+ CFLAGS="${configure.cflags} [get_canonical_archflags] -lm -pthread -Wall -funroll-loops -Wunused-result"
+
+destroot {
+ set execdir ${prefix}/libexec/${name}
+ xinstall -d ${destroot}${execdir}
+ xinstall -m 755 -W ${worksrcpath} \
+ word2vec word2phrase distance word-analogy compute-accuracy \
+ demo-analogy.sh demo-classes.sh demo-phrase-accuracy.sh \
+ demo-phrases.sh demo-word-accuracy.sh demo-word.sh \
+ ${destroot}${execdir}
+ set exdir ${prefix}/share/examples/${name}
+ xinstall -d ${destroot}${exdir}
+ xinstall -m 644 -W ${worksrcpath} \
+ questions-phrases.txt questions-words.txt \
+ ${destroot}${exdir}
+ # fix demo scripts.
+ foreach f [glob ${destroot}${execdir}/demo-*.sh] {
+ reinplace "s|@EXECDIR@|${execdir}|g" ${f}
+ reinplace "s|@EXDIR@|${exdir}|g" ${f}
+ }
+ set docdir ${prefix}/share/doc/${name}
+ xinstall -d ${destroot}${docdir}
+ xinstall -m 644 -W ${worksrcpath} \
+ LICENSE README.txt \
+ ${destroot}${docdir}
+}
+
+livecheck.url none
Property changes on: trunk/dports/textproc/word2vec/Portfile
___________________________________________________________________
Added: svn:keywords
+ Id
Added: svn:eol-style
+ native
Added: trunk/dports/textproc/word2vec/files/patch-compute-accuracy.c.diff
===================================================================
--- trunk/dports/textproc/word2vec/files/patch-compute-accuracy.c.diff (rev 0)
+++ trunk/dports/textproc/word2vec/files/patch-compute-accuracy.c.diff 2014-02-23 14:36:18 UTC (rev 117348)
@@ -0,0 +1,12 @@
+--- compute-accuracy.c.orig 2014-02-23 20:29:23.000000000 +0900
++++ compute-accuracy.c 2014-02-23 20:30:44.000000000 +0900
+@@ -28,7 +28,8 @@
+ FILE *f;
+ char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch;
+ float dist, len, bestd[N], vec[max_size];
+- long long words, size, a, b, c, d, b1, b2, b3, threshold = 0;
++ long long words, size, b, c, d, b1, b2, b3, threshold = 0;
++ volatile long long a = 0;
+ float *M;
+ char *vocab;
+ int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0;
Added: trunk/dports/textproc/word2vec/files/patch-demo.diff
===================================================================
--- trunk/dports/textproc/word2vec/files/patch-demo.diff (rev 0)
+++ trunk/dports/textproc/word2vec/files/patch-demo.diff 2014-02-23 14:36:18 UTC (rev 117348)
@@ -0,0 +1,116 @@
+--- demo-analogy.sh.orig 2014-02-22 20:36:04.000000000 +0900
++++ demo-analogy.sh 2014-02-22 20:27:27.000000000 +0900
+@@ -1,11 +1,13 @@
+-make
++#!/bin/sh
+ if [ ! -e text8 ]; then
+- wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+- gzip -d text8.gz -f
++ curl -O http://mattmahoney.net/dc/text8.zip
++ unzip text8.zip
+ fi
+ echo -----------------------------------------------------------------------------------------------------
+ echo Note that for the word analogy to perform well, the models should be trained on much larger data sets
+ echo Example input: paris france berlin
+ echo -----------------------------------------------------------------------------------------------------
+-time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+-./word-analogy vectors.bin
++if [ ! -e vectors.bin ]; then
++ time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
++fi
++@EXECDIR@/word-analogy vectors.bin
+--- demo-classes.sh.orig 2014-02-22 20:36:09.000000000 +0900
++++ demo-classes.sh 2014-02-22 20:22:53.000000000 +0900
+@@ -1,8 +1,8 @@
+-make
++#!/bin/sh
+ if [ ! -e text8 ]; then
+- wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+- gzip -d text8.gz -f
++ curl -O http://mattmahoney.net/dc/text8.zip
++ unzip text8.zip
+ fi
+-time ./word2vec -train text8 -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500
++time @EXECDIR@/word2vec -train text8 -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500
+ sort classes.txt -k 2 -n > classes.sorted.txt
+ echo The word classes were saved to file classes.sorted.txt
+--- demo-phrase-accuracy.sh.orig 2014-02-22 20:36:25.000000000 +0900
++++ demo-phrase-accuracy.sh 2014-02-22 20:29:40.000000000 +0900
+@@ -1,12 +1,14 @@
+-make
++#!/bin/sh
+ if [ ! -e text8 ]; then
+- wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+- gzip -d text8.gz -f
++ curl -O http://mattmahoney.net/dc/text8.zip
++ unzip text8.zip
+ fi
+ echo ----------------------------------------------------------------------------------------------------------------
+ echo Note that the accuracy and coverage of the test set questions is going to be low with this small training corpus
+ echo To achieve better accuracy, larger training set is needed
+ echo ----------------------------------------------------------------------------------------------------------------
+-time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 -min-count 3
+-time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3
+-./compute-accuracy vectors-phrase.bin <questions-phrases.txt
++if [ ! -e vectors-phrase.bin ]; then
++ time @EXECDIR@/word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 -min-count 3
++ time @EXECDIR@/word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3
++fi
++@EXECDIR@/compute-accuracy vectors-phrase.bin < @EXDIR@/questions-phrases.txt
+--- demo-phrases.sh.orig 2014-02-22 20:36:17.000000000 +0900
++++ demo-phrases.sh 2014-02-22 20:30:19.000000000 +0900
+@@ -1,8 +1,10 @@
+-make
++#!/bin/sh
+ if [ ! -e text8 ]; then
+- wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+- gzip -d text8.gz -f
++ curl -O http://mattmahoney.net/dc/text8.zip
++ unzip text8.zip
+ fi
+-time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2
+-time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+-./distance vectors-phrase.bin
+\ No newline at end of file
++if [ ! -e vectors-phrase.bin ]; then
++ time @EXECDIR@/word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2
++ time @EXECDIR@/word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
++fi
++@EXECDIR@/distance vectors-phrase.bin
+--- demo-word-accuracy.sh.orig 2014-02-22 20:36:32.000000000 +0900
++++ demo-word-accuracy.sh 2014-02-22 20:31:16.000000000 +0900
+@@ -1,8 +1,10 @@
+-make
++#!/bin/sh
+ if [ ! -e text8 ]; then
+- wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+- gzip -d text8.gz -f
++ curl -O http://mattmahoney.net/dc/text8.zip
++ unzip text8.zip
+ fi
+-time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+-./compute-accuracy vectors.bin 30000 < questions-words.txt
+-# to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt
++if [ ! -e vectors.bin ]; then
++ time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
++fi
++@EXECDIR@/compute-accuracy vectors.bin 30000 < @EXDIR@/questions-words.txt
++# to compute accuracy with the full vocabulary, use: @EXECDIR@/compute-accuracy vectors.bin < @EXDIR@/questions-words.txt
+--- demo-word.sh.orig 2014-02-22 20:36:47.000000000 +0900
++++ demo-word.sh 2014-02-22 20:31:57.000000000 +0900
+@@ -1,7 +1,9 @@
+-make
++#!/bin/sh
+ if [ ! -e text8 ]; then
+- wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+- gzip -d text8.gz -f
++ curl -O http://mattmahoney.net/dc/text8.zip
++ unzip text8.zip
+ fi
+-time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+-./distance vectors.bin
+\ No newline at end of file
++if [ ! -e vectors.bin ]; then
++ time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
++fi
++@EXECDIR@/distance vectors.bin
Added: trunk/dports/textproc/word2vec/files/patch-malloc.diff
===================================================================
--- trunk/dports/textproc/word2vec/files/patch-malloc.diff (rev 0)
+++ trunk/dports/textproc/word2vec/files/patch-malloc.diff 2014-02-23 14:36:18 UTC (rev 117348)
@@ -0,0 +1,33 @@
+--- compute-accuracy.c.orig 2014-02-22 19:15:25.000000000 +0900
++++ compute-accuracy.c 2014-02-22 19:17:40.000000000 +0900
+@@ -16,7 +16,7 @@
+ #include <stdlib.h>
+ #include <string.h>
+ #include <math.h>
+-#include <malloc.h>
++#include <stdlib.h>
+ #include <ctype.h>
+
+ const long long max_size = 2000; // max length of strings
+--- distance.c.orig 2014-02-22 19:15:32.000000000 +0900
++++ distance.c 2014-02-22 19:16:29.000000000 +0900
+@@ -15,7 +15,7 @@
+ #include <stdio.h>
+ #include <string.h>
+ #include <math.h>
+-#include <malloc.h>
++#include <stdlib.h>
+
+ const long long max_size = 2000; // max length of strings
+ const long long N = 40; // number of closest words that will be shown
+--- word-analogy.c.orig 2014-02-22 19:15:49.000000000 +0900
++++ word-analogy.c 2014-02-22 19:17:27.000000000 +0900
+@@ -15,7 +15,7 @@
+ #include <stdio.h>
+ #include <string.h>
+ #include <math.h>
+-#include <malloc.h>
++#include <stdlib.h>
+
+ const long long max_size = 2000; // max length of strings
+ const long long N = 40; // number of closest words that will be shown
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.macosforge.org/pipermail/macports-changes/attachments/20140223/a0d52bdd/attachment-0001.html>
More information about the macports-changes
mailing list