;;; -*- Mode: Lisp; Package: CL-USER -*- ;;; --------------------------------------------------------------------------- ;;; File: class.lisp ;;; Description: Map grids into features ;;; Author: Regina Barzilay ;;; Date: 9-04, 11-06 ;;; Package: CL-USER ;;; --------------------------------------------------------------------------- (in-package :cl-user) (defconstant *permutation-number* 10) ;; destructive single-arg mapcar (map-into doesn't work in acl4) (defun nmapcar (func lst) (do ((l lst (cdr l))) ((null l) lst) (setf (car l) (funcall func (car l))))) (defun list-of-chars (obj) (cond ((consp obj) obj) ((stringp obj) (map 'list #'identity obj)) (t (list obj)))) ;; get a string and a list of separators and explode it according to it, return ;; a list of strings. (defun explode-string (str &optional (separators #\space)) (let ((separators (list-of-chars separators)) (start nil) (res nil)) (dotimes (i (length str)) (let* ((ch (char str i)) (sep? (member ch separators))) (cond ((and sep? start) (push (subseq str start i) res) (setf start nil)) ((not (or sep? start)) (setf start i))))) (when start (push (subseq str start) res)) (nreverse res))) (defun read-file (file) (with-open-file (inp file :direction :input :if-does-not-exist :error) (let (line res) (loop do (setf line (read-line inp nil nil)) until (null line) do (when line (push (explode-string line) res))) (nreverse res)))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; This family of functions generates n=gram features given a grid (defvar ngram-letters '(#\X #\O #\S #\-)) (defun generate-ngram-marker (n) (labels ((doit (n) (if (= n 0) '(()) (reduce #'append (mapcar #'(lambda (l) (mapcar #'(lambda (m) (cons l m)) (doit (1- n)))) ngram-letters))))) (mapcar #'(lambda (l) (coerce l 'string)) (doit n)))) (defun ngram-maker (n) (let ((inter (generate-ngram-marker n)) (start (mapcar #'(lambda (x) (format nil "0~a" x)) (generate-ngram-marker (1- n)))) (end (mapcar #'(lambda (x) (format nil "~a1" x)) (generate-ngram-marker (1- n))))) (append inter start end))) (defvar *feature-count*) (defun print-ngram-table (n table out) ; (format out "--- Showing ~a-grams:~%" n) (dolist (m (ngram-maker n)) (format out "~a:~a " (incf *feature-count*) (gethash m table 0)))) (defun extract-unigrams (matrix) (let ((table (make-hash-table :test #'equalp))) (when matrix (let ((c (/ 1.0 (* (length matrix) (length (car matrix)))))) (dolist (row matrix) (dolist (letter row) (incf (gethash letter table 0.0) c))))) table)) (defun extract-bigrams (matrix) (let ((table (make-hash-table :test #'equalp))) (when matrix (let ((c (/ 1.0 (* (length matrix) (length (car matrix))))) (prev nil)) (dolist (row matrix) (setf prev "0") (dolist (letter `(,@row "1")) (incf (gethash (concatenate 'string prev letter) table 0) c) (setf prev letter))))) table)) (defun extract-trigrams (matrix) (let ((table (make-hash-table :test #'equalp))) (when matrix (let ((c (/ 1.0 (* (length matrix) (length (car matrix))))) (prev1 nil) (prev2 nil)) (dolist (row matrix) (setf prev1 "0" prev2 "0") (dolist (letter `(,@row "1" "1")) (incf (gethash (concatenate 'string prev1 prev2 letter) table 0) c) (shiftf prev1 prev2 letter))))) table)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; This family of functions tranforms matrices to get alternative ling representations ;; this function removes syntactic annotations from the matrix (defun matrix-2-binary (matrix) (dolist (row matrix) (nsubstitute-if "X" (lambda (l) (member l (list "S" "O") :test #'equal)) row)) matrix) ;; this is a threshold for salience (defconstant *cut-off* 2) ;; this function transforms grid to account for salience if salience flag is on (defun transform-matrix (input out salience) (setf *feature-count* 0) (let ((matrix (mapcar #'cdr input)) (grouped (list () ()))) (dolist (row matrix) (if salience (let ((freq (count-if-not #'(lambda (x) (equalp x "-")) row))) (cond ((>= freq *cut-off*) (push row (first grouped))) (t (push row (second grouped))))) (push row (first grouped)))) (dolist (g grouped) (when g (print-ngram-table 1 (extract-unigrams g) out) ;(format t "~a~%" *feature-count*) (print-ngram-table 2 (extract-bigrams g) out) ;(format t "~a~%" *feature-count*) (print-ngram-table 3 (extract-trigrams g) out))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; This family of functions just parses the directory to find sets of related permutations (defun find-core (s) (let ((s (file-namestring s))) (subseq s 0 (search "-" s :test #'equalp)))) (defun find-same-core-files (l) (let ((cores (remove-duplicates (mapcar #'find-core l) :test #'equalp)) (res nil)) (format t "~a~%" cores) (dolist (core cores) (push (remove-if-not #'(lambda (x) (equalp (find-core x) core)) l) res)) res)) (defun orig-p (s) (let ((s (file-namestring s))) (or (search "-1.grid" (file-namestring s)) (search "-1-p." (file-namestring s))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This family of functions prints features in the format of svm light. (defun generate-features (file out i &key syntax salience) (let ((input (if syntax (read-file file) (matrix-2-binary (read-file file))))) (when input (format out "~a " (if (or (search "-1.grid" (file-namestring file)) (search "-1-p." (file-namestring file))) "2" "1")) (format out "qid:~a " i) (transform-matrix input out salience) (format out "\#~a~%" (file-namestring file))))) (defun generate-rerank (dir f syntax salience) (let ((files (find-same-core-files (directory dir))) (i 0)) (with-open-file (out f :direction :output :if-exists :supersede :if-does-not-exist :create) (dolist (core files) (dotimes (j (- (length core) 1)) (incf i) (generate-features (first core) out i :syntax syntax :salience salience) (generate-features (nth (1+ j) core) out i :syntax syntax :salience salience)))))) (defun grid2features (dir name ana) (generate-rerank dir (format nil "~a-ubt-syntax-salience-~a" name ana) t t) (generate-rerank dir (format nil "~a-ubt-no_syntax-salience-~a" name ana) nil t) (generate-rerank dir (format nil "~a-ubt-syntax-no_salience-~a" name ana) t nil) (generate-rerank dir (format nil "~a-ubt-no_syntax-no_salience-~a" name ana) nil nil)) (defun generate-features-for-all-comb () (grid2features "ana/data2-train/" "revisions/data2.train" "ana") (grid2features "data2-training.parsed.grid/" "revisions/data2.train" "no_ana") (grid2features "ana/data2-test/" "revisions/data2.test" "ana") (grid2features "data2-testing.parsed.grid/" "revisions/data2.test" "no_ana") (grid2features "ana/data1-train/" "revisions/data1.train" "ana") (grid2features "data1-training.parsed.grid/" "revisions/data1.train" "no_ana") (grid2features "ana/data1-test/" "revisions/data1.test" "ana") (grid2features "data1-testing.parsed.grid/" "revisions/data1.test" "no_ana")) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This family of functions run svm light for training and testing and compute accuracy (defun compute-accuracy (file) (let ((predictions (read-file file)) (correct 0)) (dotimes (i (/ (length predictions) 2)) (when (> (read-from-string (car (nth (* 2 i) predictions))) (read-from-string (car (nth (1+ (* 2 i)) predictions)))) (incf correct))) (* 1.0 (/ correct (/ (length predictions) 2))))) (defun run-svm (set conf) (let ((*print-circle* nil)) (run-shell-command (format nil "~~/tools/svm-light/svm_learn -z p revisions/~a.train-~a revisions/~a~a-model > /dev/null" set conf set conf)) (run-shell-command (format nil "~~/tools/svm-light/svm_classify revisions/~a.test-~a revisions/~a~a-model revisions/~a~a-predictions > /dev/null " set conf set conf set conf set conf)) (format t "~%ACCURACY ~a-~a ~a~%" set conf (compute-accuracy (format nil "revisions/~a~a-predictions" set conf))))) (defun run-svm-for-all-comb () (dolist (i (list 1 2)) (dolist (ana (list "ana" "no_ana")) (run-svm (format nil "data~a" i) (format nil "ubt-syntax-salience-~a" ana)) (run-svm (format nil "data~a" i) (format nil "ubt-no_syntax-salience-~a" ana)) (run-svm (format nil "data~a" i) (format nil "ubt-syntax-no_salience-~a" ana)) (run-svm (format nil "data~a" i) (format nil "ubt-no_syntax-no_salience-~a" ana))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This family of functions checks the significance results for each pair of combinations (defun significance-test (prediction1 prediction2) (let ((prediction1 (read-file prediction1)) (prediction2 (read-file prediction2)) (n+ 0) (n- 0)) (dotimes (i (/ (length prediction1) 2)) (let ((p1 (> (read-from-string (car (nth (* 2 i) prediction1))) (read-from-string (car (nth (1+ (* 2 i)) prediction1))))) (p2 (> (read-from-string (car (nth (* 2 i) prediction2))) (read-from-string (car (nth (1+ (* 2 i)) prediction2)))))) (cond ((and p1 (not p2)) (incf n+)) ((and p2 (not p1)) (incf n-))))) (run-shell-command (format nil "~~/tmp/significance ~a ~a~%" n+ n-))))