Created
April 19, 2017 14:41
-
-
Save vseloved/85408fff4e4559555665e3acd906a0e8 to your computer and use it in GitHub Desktop.
Unfinished code for NNSE calculation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(in-package #:nlp.embeddings) | |
(named-readtables:in-readtable rutilsx-readtable) | |
(eval-always | |
(rename-package "BKNR.SKIP-LIST" "BKNR.SKIP-LIST" '("SKLIST"))) | |
(defun gather-freq-dict (vecs dir &key (cutoff 0) (dump-file "/tmp/dict.txt")) | |
(let ((dict #h(equal)) | |
(idxs #h(equal)) | |
(total 0) | |
(i -1)) | |
(dolist (file (uiop:directory-files dir)) | |
(dolines (line file) | |
(dolist (word (split #\Space line)) | |
(:+ (get# (normalize vecs word) dict 0)) | |
(:+ total)))) | |
(rem# nil dict) | |
(format *debug-io* "Freq dict - total words: ~A, distinct words: ~A, ~ | |
after frequency cutoff (~A): ~A~%" | |
total (ht-count dict) cutoff | |
(dotable (word freq dict | |
(ht-count dict)) | |
(when (<= freq cutoff) | |
(rem# word dict)))) | |
(dotable (word freq dict) | |
(:= (? idxs word) (:+ i))) | |
(when dump-file | |
(with-out-file (out dump-file) | |
(format out "~A~%" total) | |
(dotable (word idx idxs) | |
(format out "~A ~A ~A~%" word idx (? dict word))))) | |
(values dict | |
idxs | |
total))) | |
(defun gather-cooc-mat (vecs dir dict idxs total | |
&key (window 10) (weighting 'identity) | |
(dump-file "/tmp/cooc.txt")) | |
(let ((coocs (make-array (ht-count dict) :initial-contents | |
(maptimes (ht-count dict) | |
^(make 'sklist:skip-list)))) | |
(min most-positive-fixnum) | |
(max 0) | |
(total-neighbours 0) | |
(cc -1)) | |
(dolist (file (uiop:directory-files dir)) | |
(dolines (line file) | |
(when (zerop (rem (:+ cc) 100000)) (princ ".")) | |
(let ((words (coerce (remove nil | |
(mapcar ^(? idxs (normalize vecs %)) | |
(split #\Space line))) | |
'vector))) | |
(dotimes (i (length words)) | |
(let ((cooc (? coocs (? words i)))) | |
(iter (:for j :from (max 0 (- i window)) | |
:to (min (1- (length words)))) | |
(unless (= i j) | |
(sklist:skip-list-insert | |
cooc j (1+ (or (sklist:skip-list-search cooc j) | |
0)))))))))) | |
(when dump-file | |
(with-out-file (out dump-file) | |
(dovec (cooc coocs) | |
(let (cur) | |
(sklist:map-skip-list ^(push (fmt "~A:~A" % %%) cur) | |
cooc) | |
(format out "~{~A~^ ~}~%" cur))))) | |
(dovec (cooc coocs) | |
(let ((count (sklist:skip-list-length cooc))) | |
(when (< count min) (:= min count)) | |
(when (> count max) (:= max count)) | |
(:+ total-neighbours count))) | |
(format *debug-io* "Coocurences - max: ~A, min: ~A, mean: ~$~%" | |
max min (/ total-neighbours (length coocs))) | |
coocs)) | |
(defun calc-ppmi-mat (total dict idxs coocs) | |
(let ((freqs #h()) | |
(ppmis (make-array (length coocs)))) | |
(dotable (word idx idxs) | |
(:= (? freqs idx) (? dict word))) | |
(dotable (_ idx idxs) | |
(with ((cooc (? coocs idx)) | |
(ppmi #h())) | |
(sklist:map-skip-list ^(:= (? ppmi %) | |
(if (= % idx) 0 | |
(max (log (/ (* %% total) | |
(* (? freqs idx) | |
(? freqs %))) | |
2) | |
0))) | |
cooc) | |
(:= (? ppmis idx) ppmi))) | |
ppmis)) | |
;;; SVD | |
(defclass sparse-mat () | |
((data :initarg :data :accessor mat-data))) | |
(defmethod fsvd:height-of ((mat sparse-mat) &key densep) | |
(declare (ignore densep)) | |
(length @mat.data)) | |
(defmethod fsvd:width-of ((mat sparse-mat) &key densep) | |
(declare (ignore densep)) | |
(length @mat.data)) | |
(defmethod fsvd:size-of ((mat sparse-mat)) | |
(let ((rez 0)) | |
(dovec (ht @mat.data) | |
(:+ rez (ht-count ht))) | |
rez)) | |
(defmethod fsvd:map-matrix (fn (mat sparse-mat)) | |
(let ((h (fsvd:height-of mat)) | |
(w (fsvd:width-of mat)) | |
(cnt -1)) | |
(loop :for i :below h :do | |
(let ((row (aref (slot-value mat 'data) i))) | |
(when (zerop (rem i 1000)) (princ ".")) | |
(loop :for j :below w :do | |
(when-it (get# j row) | |
(call fn i j it (:+ cnt)))))))) | |
(defmethod fsvd:do-matrix-macro-name ((mat sparse-mat)) | |
'do-sparse-mat) | |
(defmacro do-sparse-mat (((i j val dense-index) mat) | |
&body body) | |
(with ((width (gensym)) | |
(matrix (gensym)) | |
(row (gensym)) | |
(declarations body (fsvd::split-body body))) | |
`(let* ((,matrix ,mat) | |
(,dense-index 0) | |
(,width (the fixnum (fsvd:width-of ,matrix)))) | |
(declare (type (integer 0 #.(1- most-positive-fixnum)) ,dense-index)) | |
(dotimes (,i (the fixnum (fsvd:height-of ,matrix))) | |
(when (zerop (rem ,i 1000)) (princ ".")) | |
(let ((,row (aref (slot-value ,matrix 'data) ,i))) | |
(dotimes (,j ,width) | |
(let ((,val (get# ,j ,row))) | |
,@declarations | |
(when ,val | |
,@body | |
(:+ ,dense-index))))))))) | |
;;; NNSE | |
(defclass nnse (mem-vecs) | |
() | |
(:documentation | |
"NNSE word embeddings.")) | |
(defun nnse-cost (mat a d) | |
(mat:nrm2 (mat:m- mat | |
(mat:m* a d)))) | |
(defun col-unit-norm! (mat) | |
"Rescale each column of MAT to unit L2 norm." | |
(dotimes (j (? (mat:mat-dimensions mat) 1)) | |
(let ((total 0)) | |
(dotimes (i (? (mat:mat-dimensions mat) 0)) | |
(:+ total (expt (mat:mref mat i j) 2))) | |
(dotimes (i (? (mat:mat-dimensions mat) 0)) | |
(:/ (mat:mref mat i j) (sqrt total))))) | |
mat) | |
(defun row-unit-norm! (mat) | |
"Rescale each row of MAT to unit L2 norm." | |
(dotimes (i (? (mat:mat-dimensions mat) 0)) | |
(let ((total 0)) | |
(dotimes (j (? (mat:mat-dimensions mat) 1)) | |
(:+ total (expt (mat:mref mat i j) 2))) | |
(dotimes (j (? (mat:mat-dimensions mat) 1)) | |
(:/ (mat:mref mat i j) total)))) | |
mat) | |
(defun calc-nnsc (mat k &key (sparsity 0.7) (inner-steps 100) | |
(ep 1e-3) (max-steps 1000)) | |
(with ((step 0) | |
(d (? (mat:mat-dimensions mat) 0)) | |
(n (? (mat:mat-dimensions mat) 1)) | |
(W (mat:make-mat (list d k))) | |
(H (mat:make-mat (list k n))) | |
(cost 0) | |
(prev 0)) | |
(dotimes (i k) | |
(dotimes (j n) | |
(:= (mat:mref H i j) (+ 1e-10 (random 0.1))))) | |
(:= cost (nnse-cost mat W H)) | |
;; optimize | |
(loop :while (and (< step max-steps) | |
(> (abs (- cost prev)) ep)) :do | |
(format *debug-io* "Step ~A cost: ~A~%" step cost) | |
(dotimes (i d) | |
(dotimes (j k) | |
(:= (mat:mref W i j) 1)));(+ 1e-10 (random 0.1))))) | |
;; initial A optimization | |
(loop :repeat inner-steps :do | |
(mat:.*! (mat:.*! (mat:m* mat H :transpose-b? t) | |
(mat:.expt! (mat:m* (mat:m* W H) | |
H :transpose-b? t) | |
-1)) | |
W)) | |
;; enforce sparcity constraints | |
(dotimes (j k) | |
(let (vals) | |
(dotimes (i d) | |
(push (pair i (mat:mref W i j)) vals)) | |
(:= vals (coerce (sort vals '< :key 'rt) 'vector)) | |
(dotimes (i (floor (* sparsity d))) | |
(:= (mat:mref W (? vals i 0) j) 0)))) | |
;; stepwise W & H optimization | |
(loop :repeat inner-steps :do | |
(let ((div (mat:m* W | |
(mat:m* H H | |
:transpose-b? t)))) | |
(dotimes (i d) | |
(dotimes (j k) | |
(when (zerop (mat:mref div i j)) | |
(:= (mat:mref div i j) 1)))) | |
(mat:.*! (mat:.*! (mat:m* mat H :transpose-b? t) | |
(mat:.expt! div -1)) | |
W)) | |
(mat:.*! (mat:.*! (mat:m* W mat :transpose-a? t) | |
(mat:.expt! (mat:m* (mat:m* W W :transpose-a? t) | |
H) | |
-1)) | |
H) | |
(print (list W H))) | |
;; (mat:axpy! (- learning-rate) | |
;; (mat:m* (mat:m- (mat:m* a d) | |
;; mat) | |
;; d | |
;; :transpose-b? t) | |
;; a) | |
;; (dotimes (i (? (mat:mat-dimensions mat) 0)) | |
;; (dotimes (j dims) | |
;; (when (minusp (mat:mref a i j)) | |
;; (:= (mat:mref a i j) 0)))) | |
;; (col-unit-norm! a) | |
;; (mat:axpy! (- learning-rate) | |
;; (mat:m* a | |
;; (mat:m- (mat:m* a s) | |
;; mat) | |
;; :transpose-a? t) | |
;; s) | |
;; (dotimes (i dims) | |
;; (dotimes (j (? (mat:mat-dimensions mat) 1)) | |
;; (when (minusp (mat:mref s i j)) | |
;; (:= (mat:mref s i j) 0)))) | |
;; (row-unit-norm! s) | |
;; (mat:axpy! learning-rate | |
;; (mat:m- (mat:m* a mat :transpose-a? t) | |
;; (mat:.+! la | |
;; (mat:m* (mat:m* a a :transpose-a? t) | |
;; d))) | |
;; d) | |
;; (mat:.*! (mat:.*! (mat:m* a mat :transpose-a? t) | |
;; (mat:.expt! (mat:.+! la | |
;; (mat:m* (mat:m* a a :transpose-a? t) | |
;; d)) | |
;; -1)) | |
;; d) | |
;; (row-unit-norm! d) | |
(:+ step) | |
(:= prev cost | |
cost (nnse-cost mat W H))) | |
(format *debug-io* "Final cost: ~A~%" cost) | |
(values W | |
H))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment