;; Nada Amin (namin@mit.edu)
;; 6.945 Problem Set 1
;; Due: Wed. 13 Feb. 2008

;; Problem 1.1

(define (r:* expr)
  (r:repeat 0 #f expr))

(define (r:+ expr)
  (r:repeat 1 #f expr))

#|

Test cases:

(pp (r:grep
     (r:seq (r:+ (r:quote "cat"))
            (r:* (r:quote "dog"))
            (r:+ (r:quote "cat")))
            "tests.txt"))
("[09]. catdogcat" 
 "[10]. catcatdogdog"
 "[12]. catcatcatdogdogdog"
 "[13]. acatdogdogcats")
;Unspecified return value

(pp (r:grep 
     (r:seq (r:+ (r:quote "dog"))
            (r:* (r:quote "cat"))
            (r:+ (r:quote "dog")))
            "tests.txt"))
("[10]. catcatdogdog" 
 "[11]. dogdogcatdogdog"
 "[12]. catcatcatdogdogdog"
 "[13]. acatdogdogcats"
 "[14]. ifacatdogdogs"
 "[15]. acatdogdogsme")
;Unspecified return value

(pp (r:grep 
     (let ((digit
            (r:char-from (string->char-set "0123456789"))))
       (r:seq (r:bol)
              (r:quote "[")
              digit
              digit
              (r:quote "]")
              (r:quote ".")
              (r:quote " ")
              (r:* (r:alt "cat" "dog"))
              (r:eol)))
     "tests.txt"))
("[09]. catdogcat"
 "[10]. catcatdogdog"
 "[11]. dogdogcatdogdog"
 "[12]. catcatcatdogdogdog")
;Unspecified return value
|#

;; Problem 1.2
#|

a. Louis' idea would trigger an infinite loop on (r:repeat 0 1 expr)
   as (r:repeat 0 1 expr) would call (r:repeat 0 1 expr).

b. Bonnie's proposal has the following advantages over Alyssa's:

   - The data produced is typically shorter, because matching n exprs
     makes use of matching n-1 exprs.

   - The code is simpler to write, because there is no need to
     generate a separate alternative for each possible number of 
     matches.

c. Ben's proposal has the advantage of re-using a feature of the
   underlying regular expression language instead of re-implementing 
   this feature. In addition, Ben's proposal relies on a feature 
   available for both BREs and EREs while Alyssa's and Bonnie's 
   proposals rely on a feature only available for EREs.
|#

;; d.

(define (r:repeat min max expr)
  (if (not (exact-nonnegative-integer? min))
      (error "Min must be non-negative integer:" min))
  (if max
      (begin
        (if (not (exact-nonnegative-integer? max))
            (error "Max must be non-negative integer:" max))
        (if (not (<= min max))
            (error "Min not less than max:" min max))))
  (r:seq expr 
         "\\{" 
         (number->string min) 
         "," 
         (if max (number->string max) "")
         "\\}"))

#|

The new procedures work on all the original test cases.
Here are a few more:

(r:repeat 1 #f (r:quote "foo"))
;Value: "\\(\\(foo\\)\\{1,\\}\\)"

(r:repeat 0 3 (r:seq (r:quote "a") (r:quote "b")))
;Value: "\\(\\(\\(a\\)\\(b\\)\\)\\{0,3\\}\\)"

(r:repeat 3 3 (r:alt (r:quote "a") (r:quote "b")))
;Value: "\\(\\(\\(a\\)\\|\\(b\\)\\)\\{3,3\\}\\)"

|#

;; Problem 1.3

#|

A subtle case to watch out for is that we cannot decide at the time
of constructing an expression whether its parentheses are superfluous.
Indeed, (r:seq (r:quote "foo") (r:quote "bar")) generates
\(\(foo\)\(bar\)\) which can be reduced to foobar, while 
(r:* (r:quote "foo")) generates \(\(foo\)\{0,\}\) which is not
the same as \(foo\{0,\}\). So here, we don't know when building
(r:quote "foo") whether to include parentheses or not. We cannot even
make the building of parentheses the responsibility of the parent in
which an expression is included, because in (r:seq (r:alt (r:quote
"foo") (r:quote "baz")) (r:quote "bar")), the r:seq would have to
decide to parenthesize the first sub-expression while in the previous
example, it didn't need to.

I change the intermediary representation of r:seq, r:alt and r:repeat
to a node data structure. I keep the intermediary representation as
strings for regular expressions representing a single character (or
meta-character), so that I don't need to change the procedures r:dot,
r:bol, r:eol, r:char-from, r:char-not-from. For r:quote, I only call
r:seq if the quoted string consists of more than one character.

The node data structure is a list '(node ,type ,children ,compiler),
where children is a list of children node structures and compiler is a
procedure that takes in a list of the compiled children (i.e. a list
of strings) and return a string that represents the compiled
node. Having the node structure carry its own compiler makes it easier
to compile a node, because I don't need to do a dispatch on the node
type.

|#

(define (make-node type children compiler)
  (list 'node type children compiler))
(define (node? x)
  (and (pair? x) (eq? (car x) 'node)))
(define node-type cadr)
(define node-children caddr)
(define node-compiler cadddr)

(define (r:seq . exprs)
  (define (compiler sexprs)
    (apply string-append sexprs))
  (make-node 'r:seq exprs compiler))

(define (r:alt . exprs)
  (define (compiler sexprs)
    (if (pair? sexprs)
        (apply string-append
               (cons (car sexprs)
                     (append-map (lambda (sexpr)
                                   (list "\\|" sexpr))
                                 (cdr sexprs))))
      (string-append)))
  (make-node 'r:alt exprs compiler))

(define (r:repeat min max expr)
  (define (compiler sexprs)
    (let ((sexpr (car sexprs)))
      (string-append
       sexpr 
       "\\{" 
       (number->string min) 
       "," 
       (if max (number->string max) "")
       "\\}")))
  (if (not (exact-nonnegative-integer? min))
      (error "Min must be non-negative integer:" min))
  (if max
      (begin
        (if (not (exact-nonnegative-integer? max))
            (error "Max must be non-negative integer:" max))
        (if (not (<= min max))
            (error "Min not less than max:" min max))))
  (make-node 'r:repeat (list expr) compiler))

;; only call r:seq if the length of the string is > 1
(define (r:quote string)
  (let ((end (string-length string)))
    (let ((res
           (call-with-output-string        ; see RefMan section 14.3
            (lambda (port)
              (do ((i 0 (+ i 1))) ((not (< i end))) ; see RefMan 2.9
                (let ((c (string-ref string i)))
                  (if (or (char=? c #\.)
                          (char=? c #\[)
                          (char=? c #\\)
                          (char=? c #\^)
                    (char=? c #\$)
                    (char=? c #\*))
                      (write-char #\\ port))
                  (write-char c port)))))))
      (if (> end 1)
          (r:seq res)
          res))))

(define (%parenthesize? op-current op-parent)
  (define (precedence-rank op)
    (define precedence (list 'r:repeat 'r:seq 'r:alt #f))
    (length (member op precedence)))
  (< (precedence-rank op-current) (precedence-rank op-parent)))

(define (%maybe-parenthesize ok sexpr)
  (if ok
      (string-append "\\(" sexpr "\\)")
      sexpr))

(define (compile expr)
  (define (compile-children-within exprs parent)
    (map (lambda (expr)
           (compile-within expr parent))
         exprs))
  (define (compile-within expr parent)
    (if (not (node? expr))
        expr
        (let* ((current (node-type expr))
               (parenthesize? (%parenthesize? current parent))
               (children (compile-children-within
                          (node-children expr)
                          current))
               (sexpr ((node-compiler expr) children)))
          (%maybe-parenthesize parenthesize? sexpr))))
  (compile-within expr #f))

;; first compile internal representation
(define (r:grep-like program options iexpr filename)
  (let ((expr (compile iexpr))
        (port (open-output-string)))
    (and (= (run-synchronous-subprocess program
              (append options
                      (list "-e" expr (->namestring filename)))
              'output port)
            0)
         (r:split-lines (get-output-string port)))))

#|

The new version works on all the original test cases.
It handles the subtleties:

(display (compile (r:seq (r:quote "foo") (r:quote "bar"))))
foobar
;Unspecified return value

(display (compile (r:* (r:quote "foo"))))
\(foo\)\{0,\}
;Unspecified return value

(display (compile
 (r:seq (r:alt (r:quote "foo") (r:quote "baz")) (r:quote "bar"))))
\(foo\|baz\)bar
;Unspecified return value

|#

;; Problem 1.4

#|

I support back-references with the procedures (r:label key expr) and
(r:ref key).

|#

;; labels the given expr with the given key 
;; and evaluates to the given expr 
;; overwrites key if already used
(define (r:label key expr)
  (make-label key expr))
(define (make-label key expr)
  (list 'label key expr))
(define (label? x)
  (and (pair? x) (eq? (car x) 'label)))
(define label-key cadr)
(define label-expr caddr)

;; evaluates to the last expression labelled with the given key
(define (r:ref key)
  (make-ref key))
(define (make-ref key)
  (list 'ref key))
(define (ref? x)
  (and (pair? x) (eq? (car x) 'ref)))
(define ref-key cadr)

;; change the compiler to support labels and references
;; need to count parentheses and keep track of labels
(define (compile expr)
  (define (make-sub data counter labels)
    (list data counter labels))
  (define sub-data car)
  (define sub-counter cadr)
  (define sub-labels caddr)
  (define (compile-children-within exprs counter labels parent)
    (if (null? exprs)
        (make-sub '() counter labels)
        (let* ((sub1 (compile-within 
                      (car exprs) counter labels parent))
               (subr (compile-children-within 
                      (cdr exprs)
                      (sub-counter sub1)
                      (sub-labels sub1)
                      parent)))
          (make-sub (cons (sub-data sub1) (sub-data subr))
                    (sub-counter subr)
                    (sub-labels subr)))))
  (define (compile-within expr counter labels parent)
    (if (not (node? expr))
        (cond ((label? expr)
               (let ((new-counter (+ counter 1)))
                 (if (> new-counter 9)
                     (error 
                      "cannot label beyond digits" (label-key expr))
                     (let ((sub (compile-within 
                                 (label-expr expr) 
                                 new-counter
                                 (cons (list (label-key expr) new-counter) 
                                       labels)
                                 #f)))
                       (make-sub
                        (%maybe-parenthesize #t (sub-data sub))
                        (sub-counter sub)
                        (sub-labels sub))))))
              ((ref? expr)
               (let ((key-val (assoc (ref-key expr) labels)))
                 (if (not key-val)
                     (error "invalid reference" (ref-key expr))
                     (make-sub (string-append 
                                "\\" 
                                (number->string (cadr key-val)))
                               counter
                               labels))))
              (else
               (make-sub expr counter labels)))
        (let* ((current (node-type expr))
               (parenthesize? (%parenthesize? current parent))
               (children-sub (compile-children-within
                              (node-children expr)
                              (if parenthesize? 
                                  (+ counter 1) 
                                  counter)
                              labels
                              current))
               (sexpr ((node-compiler expr) 
                       (sub-data children-sub))))
          (make-sub (%maybe-parenthesize parenthesize? sexpr)
                    (sub-counter children-sub)
                    (sub-labels children-sub)))))
  (sub-data (compile-within expr 0 '() #f)))

#|

The new version works on all the original test cases.
Specific test cases:

(display (compile (r:seq (r:label 'f (r:quote "f")) (r:ref 'f))))
\(f\)\1
;Unspecified return value

(display (compile
          (r:seq (r:label 'animal 
                          (r:alt (r:quote "cat") 
                                 (r:quote "dog"))) 
                 (r:ref 'animal))))
\(cat\|dog\)\1
;Unspecified return value

(display (compile
          (let ((animal (r:alt 
                         (r:quote "cat")
                         (r:quote "dog"))))
            (r:seq (r:* animal)
                   (r:label 'animal animal)
                   (r:ref 'animal)))))
\(cat\|dog\)\{0,\}\(cat\|dog\)\2
;Unspecified return value

(pp (r:grep 
     (r:seq (r:label 'animal 
                     (r:alt (r:quote "cat") 
                            (r:quote "dog"))) 
            (r:ref 'animal))
     "tests.txt"))
("[10]. catcatdogdog" 
"[11]. dogdogcatdogdog"
"[12]. catcatcatdogdogdog"
"[13]. acatdogdogcats"
"[14]. ifacatdogdogs"
"[15]. acatdogdogsme")
;Unspecified return value

(pp (r:grep
     (let ((animal (r:alt 
                         (r:quote "cat")
                         (r:quote "dog"))))
            (r:seq (r:* animal)
                   (r:label 'animal animal)
                   (r:ref 'animal)))
     "tests.txt"))
("[10]. catcatdogdog" 
"[11]. dogdogcatdogdog"
"[12]. catcatcatdogdogdog"
"[13]. acatdogdogcats"
"[14]. ifacatdogdogs"
"[15]. acatdogdogsme")
;Unspecified return value

(pp (r:grep 
     (r:seq (r:label 'animal 
                     (r:alt (r:quote "cat") 
                            (r:quote "dog"))) 
            (r:ref 'animal)
            (r:ref 'animal))
     "tests.txt"))
("[12]. catcatcatdogdogdog")
;Unspecified return value


|#

;; Problem 1.5

#|

a. BREs and EREs significantly differ in that the metacharacters 
   {, }, |, (, and ) are special in BREs when backslashed and
   special in EREs when NOT backslashed. Here is a summary:

   metacharacter...       BREs          EREs
   ... backslashed        special       not special
   ... not backslashed    not special   special

   This is a pain, because r:quote and %char-from need to know 
   whether to generate a BRE or a ERE, as, in case of a ERE, the
   metacharacters need to be backslashed in order to loose their
   special meaning.

b. I refactor the backend so that the primitives and compilers,
   instead of just directly returning a string, return a procedure 
   that takes in the back end and returns the string of the regular 
   expression for the requested backend.

|#

;; c.

(define metacharacters (list #\{ #\} #\| #\( #\)))

(define (r:dot) (lambda (basic?) "."))

(define (r:bol) (lambda (basic?) "^"))

(define (r:eol) (lambda (basic?) "$"))

(define (r:quote string)
  (let ((end (string-length string)))
    (let 
        ((res
          (lambda (basic?)
            (call-with-output-string ; see RefMan section 14.3
             (lambda (port)
               (do ((i 0 (+ i 1))) ((not (< i end))) ; see RefMan 2.9
                 (let ((c (string-ref string i)))
                   (if (or (char=? c #\.)
                           (char=? c #\[)
                           (char=? c #\\)
                           (char=? c #\^)
                           (char=? c #\$)
                           (char=? c #\*))
                       (write-char #\\ port)
                       (if (and (not basic?)
                                (member c metacharacters))
                           (write-char #\\ port)))
                   (write-char c port))))))))
      (if (> end 1)
          (r:seq res)
          res))))

(define (%char-from negate? members)
  (define (escape-meta basic? others)
    (if basic?
        others
        (if (null? others)
            '()
            (let ((rest (cons (car others) 
                              (escape-meta basic? (cdr others)))))
              (if (member (car others) metacharacters)
                  (cons #\\ rest)
                  rest)))))
  (let ((right? (memv #\] members))
        (caret? (memv #\^ members))
        (hyphen? (memv #\- members))
        (others
         (delete-matching-items members
                                (lambda (c)
                                  (or (char=? c #\])
                                      (char=? c #\^)
                                      (char=? c #\-))))))
    (lambda (basic?)
      (if (and caret?
               hyphen?
               (not right?)
               (not negate?)
               (null? others))
          "[-^]"
          (string-append "["
                         (if negate? "^" "")
                         (if right? "]" "")
                         (list->string (escape-meta basic? others))
                         (if caret? "^" "")
                         (if hyphen? "-" "")
                         "]")))))

(define (r:seq . exprs)
  (define (compiler sexprs)
    (lambda (basic?) (apply string-append sexprs)))
  (make-node 'r:seq exprs compiler))

(define (r:alt . exprs)
  (define (compiler sexprs)
    (lambda (basic?)
      (if (pair? sexprs)
          (apply string-append
                 (cons (car sexprs)
                       (append-map (lambda (sexpr)
                                     (list (if basic?
                                               "\\|" 
                                               "|")
                                           sexpr))
                                   (cdr sexprs))))
          (string-append))))
  (make-node 'r:alt exprs compiler))

(define (r:repeat min max expr)
  (define (compiler sexprs)
    (lambda (basic?)
      (let ((sexpr (car sexprs)))
        (string-append
         sexpr 
         (if basic? "\\{" "{")
         (number->string min) 
         "," 
         (if max (number->string max) "")
         (if basic? "\\}" "}")))))
  (if (not (exact-nonnegative-integer? min))
      (error "Min must be non-negative integer:" min))
  (if max
      (begin
        (if (not (exact-nonnegative-integer? max))
            (error "Max must be non-negative integer:" max))
        (if (not (<= min max))
            (error "Min not less than max:" min max))))
  (make-node 'r:repeat (list expr) compiler))

(define (%maybe-parenthesize basic? ok sexpr)
  (if ok
      (string-append 
       (if basic? "\\(" "(")
       sexpr 
       (if basic? "\\)" ")"))
      sexpr))

(define (compile basic? expr)
  (define (make-sub data counter labels)
    (list data counter labels))
  (define sub-data car)
  (define sub-counter cadr)
  (define sub-labels caddr)
  (define (compile-children-within exprs counter labels parent)
    (if (null? exprs)
        (make-sub '() counter labels)
        (let* ((sub1 (compile-within 
                      (car exprs) counter labels parent))
               (subr (compile-children-within 
                      (cdr exprs)
                      (sub-counter sub1)
                      (sub-labels sub1)
                      parent)))
          (make-sub (cons (sub-data sub1) (sub-data subr))
                    (sub-counter subr)
                    (sub-labels subr)))))
  (define (compile-within expr counter labels parent)
    (if (not (node? expr))
        (cond ((label? expr)
               (let ((new-counter (+ counter 1)))
                 (if (> new-counter 9)
                     (error 
                      "cannot label beyond digits" (label-key expr))
                     (let ((sub (compile-within 
                                 (label-expr expr) 
                                 new-counter
                                 (cons 
                                  (list (label-key expr) new-counter)
                                  labels)
                                 #f)))
                       (make-sub
                        (%maybe-parenthesize 
                         basic? #t (sub-data sub))
                        (sub-counter sub)
                        (sub-labels sub))))))
              ((ref? expr)
               (let ((key-val (assoc (ref-key expr) labels)))
                 (if (not key-val)
                     (error "invalid reference" (ref-key expr))
                     (make-sub (string-append 
                                "\\" 
                                (number->string (cadr key-val)))
                               counter
                               labels))))
              (else
               (make-sub (expr basic?) counter labels)))
        (let* ((current (node-type expr))
               (parenthesize? (%parenthesize? current parent))
               (children-sub (compile-children-within
                              (node-children expr)
                              (if parenthesize? 
                                  (+ counter 1) 
                                  counter)
                              labels
                              current))
               (sexpr (((node-compiler expr) 
                        (sub-data children-sub))
                       basic?)))
          (make-sub (%maybe-parenthesize basic? parenthesize? sexpr)
                    (sub-counter children-sub)
                    (sub-labels children-sub)))))
  (sub-data (compile-within expr 0 '() #f)))

(define (r:grep expr filename)
  (r:grep-like #t "grep" '() expr filename))

(define (r:egrep expr filename)
  (if (eq? microcode-id/operating-system 'nt)
      (r:grep-like #f "grep" '("-E") expr filename)
      (r:grep-like #f "egrep" '() expr filename)))

(define (r:grep-like basic? program options iexpr filename)
  (let ((expr (compile basic? iexpr))
        (port (open-output-string)))
    (and (= (run-synchronous-subprocess program
              (append options
                      (list "-e" expr (->namestring filename)))
              'output port)
            0)
         (r:split-lines (get-output-string port)))))

#|

Original test cases work, except that the base strings in the handout
must be quoted:

(pp
 (r:grep 
  (r:seq (r:quote " ")
         (r:repeat 3 5 (r:alt (r:quote "cat") (r:quote "dog")))
         (r:eol)) 
  "tests.txt"))
("[09]. catdogcat" "[10]. catcatdogdog" "[11]. dogdogcatdogdog")
;Unspecified return value

(pp
 (r:grep
  (let ((digit 
         (r:char-from (string->char-set "0123456789"))))
    (r:seq (r:bol)
           (r:quote "[")
           digit
           digit
           (r:quote "]")
           (r:quote ".")
           (r:quote " ")
           (r:char-from (char-set #\a #\b))
           (r:repeat 3 5 (r:alt (r:quote "cat") (r:quote "dog")))
           (r:char-not-from (char-set #\d #\e #\f))
           (r:eol)))
  "tests.txt"))
("[13]. acatdogdogcats")
;Unspecified return value

(pp (r:egrep (r:alt (r:quote "foo") (r:quote "bar") (r:quote "baz"))
             "tests.txt"))
("[05]. foo" "[06]. bar" "[07]. foo bar baz quux")
;Unspecified return value


(pp (r:egrep (r:repeat 3 5 (r:alt (r:quote "cat") (r:quote "dog")))
            "tests.txt"))
("[09]. catdogcat"
 "[10]. catcatdogdog"
 "[11]. dogdogcatdogdog"
 "[12]. catcatcatdogdogdog"
 "[13]. acatdogdogcats"
 "[14]. ifacatdogdogs"
 "[15]. acatdogdogsme")
;Unspecified return value

(pp
 (r:egrep 
  (r:seq (r:quote " ")
         (r:repeat 3 5 (r:alt (r:quote "cat") (r:quote "dog")))
         (r:eol)) 
         "tests.txt"))
("[09]. catdogcat" "[10]. catcatdogdog" "[11]. dogdogcatdogdog")
;Unspecified return value

(pp
 (r:egrep
  (let ((digit 
         (r:char-from (string->char-set "0123456789"))))
    (r:seq (r:bol)
           (r:quote "[")
           digit
           digit
           (r:quote "]")
           (r:quote ".")
           (r:quote " ")
           (r:char-from (char-set #\a #\b))
           (r:repeat 3 5 (r:alt (r:quote "cat") (r:quote "dog")))
           (r:char-not-from (char-set #\d #\e #\f))
           (r:eol)))
  "tests.txt"))
("[13]. acatdogdogcats")
;Unspecified return value

Here are additional test cases:

(display 
 (compile #t
          (r:quote "hello|")))
hello|
;Unspecified return value

(display 
 (compile #f
          (r:quote "hello|")))
hello\|
;Unspecified return value

I added the test case
[16]. hello|bye

(pp (r:grep
     (r:quote "hello|")
    "tests.txt"))
("[16]. hello|bye")
;Unspecified return value

(pp (r:egrep
     (r:quote "hello|")
    "tests.txt"))
("[16]. hello|bye")
;Unspecified return value

(display 
 (compile #t
          (r:seq (r:label 'f (r:quote "f")) (r:ref 'f))))
\(f\)\1
;Unspecified return value

(display 
 (compile #f
          (r:seq (r:label 'f (r:quote "f")) (r:ref 'f))))
(f)\1
;Unspecified return value

(pp (r:egrep 
     (r:seq (r:label 'animal 
                     (r:alt (r:quote "cat") 
                            (r:quote "dog"))) 
            (r:ref 'animal))
     "tests.txt"))
("[10]. catcatdogdog" 
 "[11]. dogdogcatdogdog"
 "[12]. catcatcatdogdogdog"
 "[13]. acatdogdogcats"
 "[14]. ifacatdogdogs"
 "[15]. acatdogdogsme")
;Unspecified return value

|#