;; Nada Amin (namin@mit.edu) ;; 6.945 Problem Set 1 ;; Due: Wed. 13 Feb. 2008 ;; Problem 1.1 (define (r:* expr) (r:repeat 0 #f expr)) (define (r:+ expr) (r:repeat 1 #f expr)) #| Test cases: (pp (r:grep (r:seq (r:+ (r:quote "cat")) (r:* (r:quote "dog")) (r:+ (r:quote "cat"))) "tests.txt")) ("[09]. catdogcat" "[10]. catcatdogdog" "[12]. catcatcatdogdogdog" "[13]. acatdogdogcats") ;Unspecified return value (pp (r:grep (r:seq (r:+ (r:quote "dog")) (r:* (r:quote "cat")) (r:+ (r:quote "dog"))) "tests.txt")) ("[10]. catcatdogdog" "[11]. dogdogcatdogdog" "[12]. catcatcatdogdogdog" "[13]. acatdogdogcats" "[14]. ifacatdogdogs" "[15]. acatdogdogsme") ;Unspecified return value (pp (r:grep (let ((digit (r:char-from (string->char-set "0123456789")))) (r:seq (r:bol) (r:quote "[") digit digit (r:quote "]") (r:quote ".") (r:quote " ") (r:* (r:alt "cat" "dog")) (r:eol))) "tests.txt")) ("[09]. catdogcat" "[10]. catcatdogdog" "[11]. dogdogcatdogdog" "[12]. catcatcatdogdogdog") ;Unspecified return value |# ;; Problem 1.2 #| a. Louis' idea would trigger an infinite loop on (r:repeat 0 1 expr) as (r:repeat 0 1 expr) would call (r:repeat 0 1 expr). b. Bonnie's proposal has the following advantages over Alyssa's: - The data produced is typically shorter, because matching n exprs makes use of matching n-1 exprs. - The code is simpler to write, because there is no need to generate a separate alternative for each possible number of matches. c. Ben's proposal has the advantage of re-using a feature of the underlying regular expression language instead of re-implementing this feature. In addition, Ben's proposal relies on a feature available for both BREs and EREs while Alyssa's and Bonnie's proposals rely on a feature only available for EREs. |# ;; d. (define (r:repeat min max expr) (if (not (exact-nonnegative-integer? min)) (error "Min must be non-negative integer:" min)) (if max (begin (if (not (exact-nonnegative-integer? max)) (error "Max must be non-negative integer:" max)) (if (not (<= min max)) (error "Min not less than max:" min max)))) (r:seq expr "\\{" (number->string min) "," (if max (number->string max) "") "\\}")) #| The new procedures work on all the original test cases. Here are a few more: (r:repeat 1 #f (r:quote "foo")) ;Value: "\\(\\(foo\\)\\{1,\\}\\)" (r:repeat 0 3 (r:seq (r:quote "a") (r:quote "b"))) ;Value: "\\(\\(\\(a\\)\\(b\\)\\)\\{0,3\\}\\)" (r:repeat 3 3 (r:alt (r:quote "a") (r:quote "b"))) ;Value: "\\(\\(\\(a\\)\\|\\(b\\)\\)\\{3,3\\}\\)" |# ;; Problem 1.3 #| A subtle case to watch out for is that we cannot decide at the time of constructing an expression whether its parentheses are superfluous. Indeed, (r:seq (r:quote "foo") (r:quote "bar")) generates \(\(foo\)\(bar\)\) which can be reduced to foobar, while (r:* (r:quote "foo")) generates \(\(foo\)\{0,\}\) which is not the same as \(foo\{0,\}\). So here, we don't know when building (r:quote "foo") whether to include parentheses or not. We cannot even make the building of parentheses the responsibility of the parent in which an expression is included, because in (r:seq (r:alt (r:quote "foo") (r:quote "baz")) (r:quote "bar")), the r:seq would have to decide to parenthesize the first sub-expression while in the previous example, it didn't need to. I change the intermediary representation of r:seq, r:alt and r:repeat to a node data structure. I keep the intermediary representation as strings for regular expressions representing a single character (or meta-character), so that I don't need to change the procedures r:dot, r:bol, r:eol, r:char-from, r:char-not-from. For r:quote, I only call r:seq if the quoted string consists of more than one character. The node data structure is a list '(node ,type ,children ,compiler), where children is a list of children node structures and compiler is a procedure that takes in a list of the compiled children (i.e. a list of strings) and return a string that represents the compiled node. Having the node structure carry its own compiler makes it easier to compile a node, because I don't need to do a dispatch on the node type. |# (define (make-node type children compiler) (list 'node type children compiler)) (define (node? x) (and (pair? x) (eq? (car x) 'node))) (define node-type cadr) (define node-children caddr) (define node-compiler cadddr) (define (r:seq . exprs) (define (compiler sexprs) (apply string-append sexprs)) (make-node 'r:seq exprs compiler)) (define (r:alt . exprs) (define (compiler sexprs) (if (pair? sexprs) (apply string-append (cons (car sexprs) (append-map (lambda (sexpr) (list "\\|" sexpr)) (cdr sexprs)))) (string-append))) (make-node 'r:alt exprs compiler)) (define (r:repeat min max expr) (define (compiler sexprs) (let ((sexpr (car sexprs))) (string-append sexpr "\\{" (number->string min) "," (if max (number->string max) "") "\\}"))) (if (not (exact-nonnegative-integer? min)) (error "Min must be non-negative integer:" min)) (if max (begin (if (not (exact-nonnegative-integer? max)) (error "Max must be non-negative integer:" max)) (if (not (<= min max)) (error "Min not less than max:" min max)))) (make-node 'r:repeat (list expr) compiler)) ;; only call r:seq if the length of the string is > 1 (define (r:quote string) (let ((end (string-length string))) (let ((res (call-with-output-string ; see RefMan section 14.3 (lambda (port) (do ((i 0 (+ i 1))) ((not (< i end))) ; see RefMan 2.9 (let ((c (string-ref string i))) (if (or (char=? c #\.) (char=? c #\[) (char=? c #\\) (char=? c #\^) (char=? c #\$) (char=? c #\*)) (write-char #\\ port)) (write-char c port))))))) (if (> end 1) (r:seq res) res)))) (define (%parenthesize? op-current op-parent) (define (precedence-rank op) (define precedence (list 'r:repeat 'r:seq 'r:alt #f)) (length (member op precedence))) (< (precedence-rank op-current) (precedence-rank op-parent))) (define (%maybe-parenthesize ok sexpr) (if ok (string-append "\\(" sexpr "\\)") sexpr)) (define (compile expr) (define (compile-children-within exprs parent) (map (lambda (expr) (compile-within expr parent)) exprs)) (define (compile-within expr parent) (if (not (node? expr)) expr (let* ((current (node-type expr)) (parenthesize? (%parenthesize? current parent)) (children (compile-children-within (node-children expr) current)) (sexpr ((node-compiler expr) children))) (%maybe-parenthesize parenthesize? sexpr)))) (compile-within expr #f)) ;; first compile internal representation (define (r:grep-like program options iexpr filename) (let ((expr (compile iexpr)) (port (open-output-string))) (and (= (run-synchronous-subprocess program (append options (list "-e" expr (->namestring filename))) 'output port) 0) (r:split-lines (get-output-string port))))) #| The new version works on all the original test cases. It handles the subtleties: (display (compile (r:seq (r:quote "foo") (r:quote "bar")))) foobar ;Unspecified return value (display (compile (r:* (r:quote "foo")))) \(foo\)\{0,\} ;Unspecified return value (display (compile (r:seq (r:alt (r:quote "foo") (r:quote "baz")) (r:quote "bar")))) \(foo\|baz\)bar ;Unspecified return value |# ;; Problem 1.4 #| I support back-references with the procedures (r:label key expr) and (r:ref key). |# ;; labels the given expr with the given key ;; and evaluates to the given expr ;; overwrites key if already used (define (r:label key expr) (make-label key expr)) (define (make-label key expr) (list 'label key expr)) (define (label? x) (and (pair? x) (eq? (car x) 'label))) (define label-key cadr) (define label-expr caddr) ;; evaluates to the last expression labelled with the given key (define (r:ref key) (make-ref key)) (define (make-ref key) (list 'ref key)) (define (ref? x) (and (pair? x) (eq? (car x) 'ref))) (define ref-key cadr) ;; change the compiler to support labels and references ;; need to count parentheses and keep track of labels (define (compile expr) (define (make-sub data counter labels) (list data counter labels)) (define sub-data car) (define sub-counter cadr) (define sub-labels caddr) (define (compile-children-within exprs counter labels parent) (if (null? exprs) (make-sub '() counter labels) (let* ((sub1 (compile-within (car exprs) counter labels parent)) (subr (compile-children-within (cdr exprs) (sub-counter sub1) (sub-labels sub1) parent))) (make-sub (cons (sub-data sub1) (sub-data subr)) (sub-counter subr) (sub-labels subr))))) (define (compile-within expr counter labels parent) (if (not (node? expr)) (cond ((label? expr) (let ((new-counter (+ counter 1))) (if (> new-counter 9) (error "cannot label beyond digits" (label-key expr)) (let ((sub (compile-within (label-expr expr) new-counter (cons (list (label-key expr) new-counter) labels) #f))) (make-sub (%maybe-parenthesize #t (sub-data sub)) (sub-counter sub) (sub-labels sub)))))) ((ref? expr) (let ((key-val (assoc (ref-key expr) labels))) (if (not key-val) (error "invalid reference" (ref-key expr)) (make-sub (string-append "\\" (number->string (cadr key-val))) counter labels)))) (else (make-sub expr counter labels))) (let* ((current (node-type expr)) (parenthesize? (%parenthesize? current parent)) (children-sub (compile-children-within (node-children expr) (if parenthesize? (+ counter 1) counter) labels current)) (sexpr ((node-compiler expr) (sub-data children-sub)))) (make-sub (%maybe-parenthesize parenthesize? sexpr) (sub-counter children-sub) (sub-labels children-sub))))) (sub-data (compile-within expr 0 '() #f))) #| The new version works on all the original test cases. Specific test cases: (display (compile (r:seq (r:label 'f (r:quote "f")) (r:ref 'f)))) \(f\)\1 ;Unspecified return value (display (compile (r:seq (r:label 'animal (r:alt (r:quote "cat") (r:quote "dog"))) (r:ref 'animal)))) \(cat\|dog\)\1 ;Unspecified return value (display (compile (let ((animal (r:alt (r:quote "cat") (r:quote "dog")))) (r:seq (r:* animal) (r:label 'animal animal) (r:ref 'animal))))) \(cat\|dog\)\{0,\}\(cat\|dog\)\2 ;Unspecified return value (pp (r:grep (r:seq (r:label 'animal (r:alt (r:quote "cat") (r:quote "dog"))) (r:ref 'animal)) "tests.txt")) ("[10]. catcatdogdog" "[11]. dogdogcatdogdog" "[12]. catcatcatdogdogdog" "[13]. acatdogdogcats" "[14]. ifacatdogdogs" "[15]. acatdogdogsme") ;Unspecified return value (pp (r:grep (let ((animal (r:alt (r:quote "cat") (r:quote "dog")))) (r:seq (r:* animal) (r:label 'animal animal) (r:ref 'animal))) "tests.txt")) ("[10]. catcatdogdog" "[11]. dogdogcatdogdog" "[12]. catcatcatdogdogdog" "[13]. acatdogdogcats" "[14]. ifacatdogdogs" "[15]. acatdogdogsme") ;Unspecified return value (pp (r:grep (r:seq (r:label 'animal (r:alt (r:quote "cat") (r:quote "dog"))) (r:ref 'animal) (r:ref 'animal)) "tests.txt")) ("[12]. catcatcatdogdogdog") ;Unspecified return value |# ;; Problem 1.5 #| a. BREs and EREs significantly differ in that the metacharacters {, }, |, (, and ) are special in BREs when backslashed and special in EREs when NOT backslashed. Here is a summary: metacharacter... BREs EREs ... backslashed special not special ... not backslashed not special special This is a pain, because r:quote and %char-from need to know whether to generate a BRE or a ERE, as, in case of a ERE, the metacharacters need to be backslashed in order to loose their special meaning. b. I refactor the backend so that the primitives and compilers, instead of just directly returning a string, return a procedure that takes in the back end and returns the string of the regular expression for the requested backend. |# ;; c. (define metacharacters (list #\{ #\} #\| #\( #\))) (define (r:dot) (lambda (basic?) ".")) (define (r:bol) (lambda (basic?) "^")) (define (r:eol) (lambda (basic?) "$")) (define (r:quote string) (let ((end (string-length string))) (let ((res (lambda (basic?) (call-with-output-string ; see RefMan section 14.3 (lambda (port) (do ((i 0 (+ i 1))) ((not (< i end))) ; see RefMan 2.9 (let ((c (string-ref string i))) (if (or (char=? c #\.) (char=? c #\[) (char=? c #\\) (char=? c #\^) (char=? c #\$) (char=? c #\*)) (write-char #\\ port) (if (and (not basic?) (member c metacharacters)) (write-char #\\ port))) (write-char c port)))))))) (if (> end 1) (r:seq res) res)))) (define (%char-from negate? members) (define (escape-meta basic? others) (if basic? others (if (null? others) '() (let ((rest (cons (car others) (escape-meta basic? (cdr others))))) (if (member (car others) metacharacters) (cons #\\ rest) rest))))) (let ((right? (memv #\] members)) (caret? (memv #\^ members)) (hyphen? (memv #\- members)) (others (delete-matching-items members (lambda (c) (or (char=? c #\]) (char=? c #\^) (char=? c #\-)))))) (lambda (basic?) (if (and caret? hyphen? (not right?) (not negate?) (null? others)) "[-^]" (string-append "[" (if negate? "^" "") (if right? "]" "") (list->string (escape-meta basic? others)) (if caret? "^" "") (if hyphen? "-" "") "]"))))) (define (r:seq . exprs) (define (compiler sexprs) (lambda (basic?) (apply string-append sexprs))) (make-node 'r:seq exprs compiler)) (define (r:alt . exprs) (define (compiler sexprs) (lambda (basic?) (if (pair? sexprs) (apply string-append (cons (car sexprs) (append-map (lambda (sexpr) (list (if basic? "\\|" "|") sexpr)) (cdr sexprs)))) (string-append)))) (make-node 'r:alt exprs compiler)) (define (r:repeat min max expr) (define (compiler sexprs) (lambda (basic?) (let ((sexpr (car sexprs))) (string-append sexpr (if basic? "\\{" "{") (number->string min) "," (if max (number->string max) "") (if basic? "\\}" "}"))))) (if (not (exact-nonnegative-integer? min)) (error "Min must be non-negative integer:" min)) (if max (begin (if (not (exact-nonnegative-integer? max)) (error "Max must be non-negative integer:" max)) (if (not (<= min max)) (error "Min not less than max:" min max)))) (make-node 'r:repeat (list expr) compiler)) (define (%maybe-parenthesize basic? ok sexpr) (if ok (string-append (if basic? "\\(" "(") sexpr (if basic? "\\)" ")")) sexpr)) (define (compile basic? expr) (define (make-sub data counter labels) (list data counter labels)) (define sub-data car) (define sub-counter cadr) (define sub-labels caddr) (define (compile-children-within exprs counter labels parent) (if (null? exprs) (make-sub '() counter labels) (let* ((sub1 (compile-within (car exprs) counter labels parent)) (subr (compile-children-within (cdr exprs) (sub-counter sub1) (sub-labels sub1) parent))) (make-sub (cons (sub-data sub1) (sub-data subr)) (sub-counter subr) (sub-labels subr))))) (define (compile-within expr counter labels parent) (if (not (node? expr)) (cond ((label? expr) (let ((new-counter (+ counter 1))) (if (> new-counter 9) (error "cannot label beyond digits" (label-key expr)) (let ((sub (compile-within (label-expr expr) new-counter (cons (list (label-key expr) new-counter) labels) #f))) (make-sub (%maybe-parenthesize basic? #t (sub-data sub)) (sub-counter sub) (sub-labels sub)))))) ((ref? expr) (let ((key-val (assoc (ref-key expr) labels))) (if (not key-val) (error "invalid reference" (ref-key expr)) (make-sub (string-append "\\" (number->string (cadr key-val))) counter labels)))) (else (make-sub (expr basic?) counter labels))) (let* ((current (node-type expr)) (parenthesize? (%parenthesize? current parent)) (children-sub (compile-children-within (node-children expr) (if parenthesize? (+ counter 1) counter) labels current)) (sexpr (((node-compiler expr) (sub-data children-sub)) basic?))) (make-sub (%maybe-parenthesize basic? parenthesize? sexpr) (sub-counter children-sub) (sub-labels children-sub))))) (sub-data (compile-within expr 0 '() #f))) (define (r:grep expr filename) (r:grep-like #t "grep" '() expr filename)) (define (r:egrep expr filename) (if (eq? microcode-id/operating-system 'nt) (r:grep-like #f "grep" '("-E") expr filename) (r:grep-like #f "egrep" '() expr filename))) (define (r:grep-like basic? program options iexpr filename) (let ((expr (compile basic? iexpr)) (port (open-output-string))) (and (= (run-synchronous-subprocess program (append options (list "-e" expr (->namestring filename))) 'output port) 0) (r:split-lines (get-output-string port))))) #| Original test cases work, except that the base strings in the handout must be quoted: (pp (r:grep (r:seq (r:quote " ") (r:repeat 3 5 (r:alt (r:quote "cat") (r:quote "dog"))) (r:eol)) "tests.txt")) ("[09]. catdogcat" "[10]. catcatdogdog" "[11]. dogdogcatdogdog") ;Unspecified return value (pp (r:grep (let ((digit (r:char-from (string->char-set "0123456789")))) (r:seq (r:bol) (r:quote "[") digit digit (r:quote "]") (r:quote ".") (r:quote " ") (r:char-from (char-set #\a #\b)) (r:repeat 3 5 (r:alt (r:quote "cat") (r:quote "dog"))) (r:char-not-from (char-set #\d #\e #\f)) (r:eol))) "tests.txt")) ("[13]. acatdogdogcats") ;Unspecified return value (pp (r:egrep (r:alt (r:quote "foo") (r:quote "bar") (r:quote "baz")) "tests.txt")) ("[05]. foo" "[06]. bar" "[07]. foo bar baz quux") ;Unspecified return value (pp (r:egrep (r:repeat 3 5 (r:alt (r:quote "cat") (r:quote "dog"))) "tests.txt")) ("[09]. catdogcat" "[10]. catcatdogdog" "[11]. dogdogcatdogdog" "[12]. catcatcatdogdogdog" "[13]. acatdogdogcats" "[14]. ifacatdogdogs" "[15]. acatdogdogsme") ;Unspecified return value (pp (r:egrep (r:seq (r:quote " ") (r:repeat 3 5 (r:alt (r:quote "cat") (r:quote "dog"))) (r:eol)) "tests.txt")) ("[09]. catdogcat" "[10]. catcatdogdog" "[11]. dogdogcatdogdog") ;Unspecified return value (pp (r:egrep (let ((digit (r:char-from (string->char-set "0123456789")))) (r:seq (r:bol) (r:quote "[") digit digit (r:quote "]") (r:quote ".") (r:quote " ") (r:char-from (char-set #\a #\b)) (r:repeat 3 5 (r:alt (r:quote "cat") (r:quote "dog"))) (r:char-not-from (char-set #\d #\e #\f)) (r:eol))) "tests.txt")) ("[13]. acatdogdogcats") ;Unspecified return value Here are additional test cases: (display (compile #t (r:quote "hello|"))) hello| ;Unspecified return value (display (compile #f (r:quote "hello|"))) hello\| ;Unspecified return value I added the test case [16]. hello|bye (pp (r:grep (r:quote "hello|") "tests.txt")) ("[16]. hello|bye") ;Unspecified return value (pp (r:egrep (r:quote "hello|") "tests.txt")) ("[16]. hello|bye") ;Unspecified return value (display (compile #t (r:seq (r:label 'f (r:quote "f")) (r:ref 'f)))) \(f\)\1 ;Unspecified return value (display (compile #f (r:seq (r:label 'f (r:quote "f")) (r:ref 'f)))) (f)\1 ;Unspecified return value (pp (r:egrep (r:seq (r:label 'animal (r:alt (r:quote "cat") (r:quote "dog"))) (r:ref 'animal)) "tests.txt")) ("[10]. catcatdogdog" "[11]. dogdogcatdogdog" "[12]. catcatcatdogdogdog" "[13]. acatdogdogcats" "[14]. ifacatdogdogs" "[15]. acatdogdogsme") ;Unspecified return value |#