(define (r:* expr)
(r:repeat 0 #f expr))
(define (r:+ expr)
(r:repeat 1 #f expr))
Test cases:
(pp (r:grep
(r:seq (r:+ (r:quote "cat"))
(r:* (r:quote "dog"))
(r:+ (r:quote "cat")))
"tests.txt"))
("[09]. catdogcat"
"[10]. catcatdogdog"
"[12]. catcatcatdogdogdog"
"[13]. acatdogdogcats")
(pp (r:grep
(r:seq (r:+ (r:quote "dog"))
(r:* (r:quote "cat"))
(r:+ (r:quote "dog")))
"tests.txt"))
("[10]. catcatdogdog"
"[11]. dogdogcatdogdog"
"[12]. catcatcatdogdogdog"
"[13]. acatdogdogcats"
"[14]. ifacatdogdogs"
"[15]. acatdogdogsme")
(pp (r:grep
(let ((digit
(r:char-from (string->char-set "0123456789"))))
(r:seq (r:bol)
(r:quote "[")
digit
digit
(r:quote "]")
(r:quote ".")
(r:quote " ")
(r:* (r:alt "cat" "dog"))
(r:eol)))
"tests.txt"))
("[09]. catdogcat"
"[10]. catcatdogdog"
"[11]. dogdogcatdogdog"
"[12]. catcatcatdogdogdog")
|#
a. Louis' idea would trigger an infinite loop on (r:repeat 0 1 expr)
as (r:repeat 0 1 expr) would call (r:repeat 0 1 expr).
b. Bonnie's proposal has the following advantages over Alyssa's:
- The data produced is typically shorter, because matching n exprs
makes use of matching n-1 exprs.
- The code is simpler to write, because there is no need to
generate a separate alternative for each possible number of
matches.
c. Ben's proposal has the advantage of re-using a feature of the
underlying regular expression language instead of re-implementing
this feature. In addition, Ben's proposal relies on a feature
available for both BREs and EREs while Alyssa's and Bonnie's
proposals rely on a feature only available for EREs.
|#
(define (r:repeat min max expr)
(if (not (exact-nonnegative-integer? min))
(error "Min must be non-negative integer:" min))
(if max
(begin
(if (not (exact-nonnegative-integer? max))
(error "Max must be non-negative integer:" max))
(if (not (<= min max))
(error "Min not less than max:" min max))))
(r:seq expr
"\\{"
(number->string min)
","
(if max (number->string max) "")
"\\}"))
The new procedures work on all the original test cases.
Here are a few more:
(r:repeat 1 #f (r:quote "foo"))
(r:repeat 0 3 (r:seq (r:quote "a") (r:quote "b")))
(r:repeat 3 3 (r:alt (r:quote "a") (r:quote "b")))
|#
A subtle case to watch out for is that we cannot decide at the time
of constructing an expression whether its parentheses are superfluous.
Indeed, (r:seq (r:quote "foo") (r:quote "bar")) generates
\(\(foo\)\(bar\)\) which can be reduced to foobar, while
(r:* (r:quote "foo")) generates \(\(foo\)\{0,\}\) which is not
the same as \(foo\{0,\}\). So here, we don't know when building
(r:quote "foo") whether to include parentheses or not. We cannot even
make the building of parentheses the responsibility of the parent in
which an expression is included, because in (r:seq (r:alt (r:quote
"foo") (r:quote "baz")) (r:quote "bar")), the r:seq would have to
decide to parenthesize the first sub-expression while in the previous
example, it didn't need to.
I change the intermediary representation of r:seq, r:alt and r:repeat
to a node data structure. I keep the intermediary representation as
strings for regular expressions representing a single character (or
meta-character), so that I don't need to change the procedures r:dot,
r:bol, r:eol, r:char-from, r:char-not-from. For r:quote, I only call
r:seq if the quoted string consists of more than one character.
The node data structure is a list '(node ,type ,children ,compiler),
where children is a list of children node structures and compiler is a
procedure that takes in a list of the compiled children (i.e. a list
of strings) and return a string that represents the compiled
node. Having the node structure carry its own compiler makes it easier
to compile a node, because I don't need to do a dispatch on the node
type.
|#
(define (make-node type children compiler)
(list 'node type children compiler))
(define (node? x)
(and (pair? x) (eq? (car x) 'node)))
(define node-type cadr)
(define node-children caddr)
(define node-compiler cadddr)
(define (r:seq . exprs)
(define (compiler sexprs)
(apply string-append sexprs))
(make-node 'r:seq exprs compiler))
(define (r:alt . exprs)
(define (compiler sexprs)
(if (pair? sexprs)
(apply string-append
(cons (car sexprs)
(append-map (lambda (sexpr)
(list "\\|" sexpr))
(cdr sexprs))))
(string-append)))
(make-node 'r:alt exprs compiler))
(define (r:repeat min max expr)
(define (compiler sexprs)
(let ((sexpr (car sexprs)))
(string-append
sexpr
"\\{"
(number->string min)
","
(if max (number->string max) "")
"\\}")))
(if (not (exact-nonnegative-integer? min))
(error "Min must be non-negative integer:" min))
(if max
(begin
(if (not (exact-nonnegative-integer? max))
(error "Max must be non-negative integer:" max))
(if (not (<= min max))
(error "Min not less than max:" min max))))
(make-node 'r:repeat (list expr) compiler))
(define (r:quote string)
(let ((end (string-length string)))
(let ((res
(call-with-output-string (lambda (port)
(do ((i 0 (+ i 1))) ((not (< i end))) (let ((c (string-ref string i)))
(if (or (char=? c #\.)
(char=? c #\[)
(char=? c #\\)
(char=? c #\^)
(char=? c #\$)
(char=? c #\*))
(write-char #\\ port))
(write-char c port)))))))
(if (> end 1)
(r:seq res)
res))))
(define (%parenthesize? op-current op-parent)
(define (precedence-rank op)
(define precedence (list 'r:repeat 'r:seq 'r:alt #f))
(length (member op precedence)))
(< (precedence-rank op-current) (precedence-rank op-parent)))
(define (%maybe-parenthesize ok sexpr)
(if ok
(string-append "\\(" sexpr "\\)")
sexpr))
(define (compile expr)
(define (compile-children-within exprs parent)
(map (lambda (expr)
(compile-within expr parent))
exprs))
(define (compile-within expr parent)
(if (not (node? expr))
expr
(let* ((current (node-type expr))
(parenthesize? (%parenthesize? current parent))
(children (compile-children-within
(node-children expr)
current))
(sexpr ((node-compiler expr) children)))
(%maybe-parenthesize parenthesize? sexpr))))
(compile-within expr #f))
(define (r:grep-like program options iexpr filename)
(let ((expr (compile iexpr))
(port (open-output-string)))
(and (= (run-synchronous-subprocess program
(append options
(list "-e" expr (->namestring filename)))
'output port)
0)
(r:split-lines (get-output-string port)))))
The new version works on all the original test cases.
It handles the subtleties:
(display (compile (r:seq (r:quote "foo") (r:quote "bar"))))
foobar
(display (compile (r:* (r:quote "foo"))))
\(foo\)\{0,\}
(display (compile
(r:seq (r:alt (r:quote "foo") (r:quote "baz")) (r:quote "bar"))))
\(foo\|baz\)bar
|#
I support back-references with the procedures (r:label key expr) and
(r:ref key).
|#
(define (r:label key expr)
(make-label key expr))
(define (make-label key expr)
(list 'label key expr))
(define (label? x)
(and (pair? x) (eq? (car x) 'label)))
(define label-key cadr)
(define label-expr caddr)
(define (r:ref key)
(make-ref key))
(define (make-ref key)
(list 'ref key))
(define (ref? x)
(and (pair? x) (eq? (car x) 'ref)))
(define ref-key cadr)
(define (compile expr)
(define (make-sub data counter labels)
(list data counter labels))
(define sub-data car)
(define sub-counter cadr)
(define sub-labels caddr)
(define (compile-children-within exprs counter labels parent)
(if (null? exprs)
(make-sub '() counter labels)
(let* ((sub1 (compile-within
(car exprs) counter labels parent))
(subr (compile-children-within
(cdr exprs)
(sub-counter sub1)
(sub-labels sub1)
parent)))
(make-sub (cons (sub-data sub1) (sub-data subr))
(sub-counter subr)
(sub-labels subr)))))
(define (compile-within expr counter labels parent)
(if (not (node? expr))
(cond ((label? expr)
(let ((new-counter (+ counter 1)))
(if (> new-counter 9)
(error
"cannot label beyond digits" (label-key expr))
(let ((sub (compile-within
(label-expr expr)
new-counter
(cons (list (label-key expr) new-counter)
labels)
#f)))
(make-sub
(%maybe-parenthesize #t (sub-data sub))
(sub-counter sub)
(sub-labels sub))))))
((ref? expr)
(let ((key-val (assoc (ref-key expr) labels)))
(if (not key-val)
(error "invalid reference" (ref-key expr))
(make-sub (string-append
"\\"
(number->string (cadr key-val)))
counter
labels))))
(else
(make-sub expr counter labels)))
(let* ((current (node-type expr))
(parenthesize? (%parenthesize? current parent))
(children-sub (compile-children-within
(node-children expr)
(if parenthesize?
(+ counter 1)
counter)
labels
current))
(sexpr ((node-compiler expr)
(sub-data children-sub))))
(make-sub (%maybe-parenthesize parenthesize? sexpr)
(sub-counter children-sub)
(sub-labels children-sub)))))
(sub-data (compile-within expr 0 '() #f)))
The new version works on all the original test cases.
Specific test cases:
(display (compile (r:seq (r:label 'f (r:quote "f")) (r:ref 'f))))
\(f\)\1
(display (compile
(r:seq (r:label 'animal
(r:alt (r:quote "cat")
(r:quote "dog")))
(r:ref 'animal))))
\(cat\|dog\)\1
(display (compile
(let ((animal (r:alt
(r:quote "cat")
(r:quote "dog"))))
(r:seq (r:* animal)
(r:label 'animal animal)
(r:ref 'animal)))))
\(cat\|dog\)\{0,\}\(cat\|dog\)\2
(pp (r:grep
(r:seq (r:label 'animal
(r:alt (r:quote "cat")
(r:quote "dog")))
(r:ref 'animal))
"tests.txt"))
("[10]. catcatdogdog"
"[11]. dogdogcatdogdog"
"[12]. catcatcatdogdogdog"
"[13]. acatdogdogcats"
"[14]. ifacatdogdogs"
"[15]. acatdogdogsme")
(pp (r:grep
(let ((animal (r:alt
(r:quote "cat")
(r:quote "dog"))))
(r:seq (r:* animal)
(r:label 'animal animal)
(r:ref 'animal)))
"tests.txt"))
("[10]. catcatdogdog"
"[11]. dogdogcatdogdog"
"[12]. catcatcatdogdogdog"
"[13]. acatdogdogcats"
"[14]. ifacatdogdogs"
"[15]. acatdogdogsme")
(pp (r:grep
(r:seq (r:label 'animal
(r:alt (r:quote "cat")
(r:quote "dog")))
(r:ref 'animal)
(r:ref 'animal))
"tests.txt"))
("[12]. catcatcatdogdogdog")
|#
a. BREs and EREs significantly differ in that the metacharacters
{, }, |, (, and ) are special in BREs when backslashed and
special in EREs when NOT backslashed. Here is a summary:
metacharacter... BREs EREs
... backslashed special not special
... not backslashed not special special
This is a pain, because r:quote and %char-from need to know
whether to generate a BRE or a ERE, as, in case of a ERE, the
metacharacters need to be backslashed in order to loose their
special meaning.
b. I refactor the backend so that the primitives and compilers,
instead of just directly returning a string, return a procedure
that takes in the back end and returns the string of the regular
expression for the requested backend.
|#
(define metacharacters (list #\{ #\} #\| #\( #\)))
(define (r:dot) (lambda (basic?) "."))
(define (r:bol) (lambda (basic?) "^"))
(define (r:eol) (lambda (basic?) "$"))
(define (r:quote string)
(let ((end (string-length string)))
(let
((res
(lambda (basic?)
(call-with-output-string (lambda (port)
(do ((i 0 (+ i 1))) ((not (< i end))) (let ((c (string-ref string i)))
(if (or (char=? c #\.)
(char=? c #\[)
(char=? c #\\)
(char=? c #\^)
(char=? c #\$)
(char=? c #\*))
(write-char #\\ port)
(if (and (not basic?)
(member c metacharacters))
(write-char #\\ port)))
(write-char c port))))))))
(if (> end 1)
(r:seq res)
res))))
(define (%char-from negate? members)
(define (escape-meta basic? others)
(if basic?
others
(if (null? others)
'()
(let ((rest (cons (car others)
(escape-meta basic? (cdr others)))))
(if (member (car others) metacharacters)
(cons #\\ rest)
rest)))))
(let ((right? (memv #\] members))
(caret? (memv #\^ members))
(hyphen? (memv #\- members))
(others
(delete-matching-items members
(lambda (c)
(or (char=? c #\])
(char=? c #\^)
(char=? c #\-))))))
(lambda (basic?)
(if (and caret?
hyphen?
(not right?)
(not negate?)
(null? others))
"[-^]"
(string-append "["
(if negate? "^" "")
(if right? "]" "")
(list->string (escape-meta basic? others))
(if caret? "^" "")
(if hyphen? "-" "")
"]")))))
(define (r:seq . exprs)
(define (compiler sexprs)
(lambda (basic?) (apply string-append sexprs)))
(make-node 'r:seq exprs compiler))
(define (r:alt . exprs)
(define (compiler sexprs)
(lambda (basic?)
(if (pair? sexprs)
(apply string-append
(cons (car sexprs)
(append-map (lambda (sexpr)
(list (if basic?
"\\|"
"|")
sexpr))
(cdr sexprs))))
(string-append))))
(make-node 'r:alt exprs compiler))
(define (r:repeat min max expr)
(define (compiler sexprs)
(lambda (basic?)
(let ((sexpr (car sexprs)))
(string-append
sexpr
(if basic? "\\{" "{")
(number->string min)
","
(if max (number->string max) "")
(if basic? "\\}" "}")))))
(if (not (exact-nonnegative-integer? min))
(error "Min must be non-negative integer:" min))
(if max
(begin
(if (not (exact-nonnegative-integer? max))
(error "Max must be non-negative integer:" max))
(if (not (<= min max))
(error "Min not less than max:" min max))))
(make-node 'r:repeat (list expr) compiler))
(define (%maybe-parenthesize basic? ok sexpr)
(if ok
(string-append
(if basic? "\\(" "(")
sexpr
(if basic? "\\)" ")"))
sexpr))
(define (compile basic? expr)
(define (make-sub data counter labels)
(list data counter labels))
(define sub-data car)
(define sub-counter cadr)
(define sub-labels caddr)
(define (compile-children-within exprs counter labels parent)
(if (null? exprs)
(make-sub '() counter labels)
(let* ((sub1 (compile-within
(car exprs) counter labels parent))
(subr (compile-children-within
(cdr exprs)
(sub-counter sub1)
(sub-labels sub1)
parent)))
(make-sub (cons (sub-data sub1) (sub-data subr))
(sub-counter subr)
(sub-labels subr)))))
(define (compile-within expr counter labels parent)
(if (not (node? expr))
(cond ((label? expr)
(let ((new-counter (+ counter 1)))
(if (> new-counter 9)
(error
"cannot label beyond digits" (label-key expr))
(let ((sub (compile-within
(label-expr expr)
new-counter
(cons
(list (label-key expr) new-counter)
labels)
#f)))
(make-sub
(%maybe-parenthesize
basic? #t (sub-data sub))
(sub-counter sub)
(sub-labels sub))))))
((ref? expr)
(let ((key-val (assoc (ref-key expr) labels)))
(if (not key-val)
(error "invalid reference" (ref-key expr))
(make-sub (string-append
"\\"
(number->string (cadr key-val)))
counter
labels))))
(else
(make-sub (expr basic?) counter labels)))
(let* ((current (node-type expr))
(parenthesize? (%parenthesize? current parent))
(children-sub (compile-children-within
(node-children expr)
(if parenthesize?
(+ counter 1)
counter)
labels
current))
(sexpr (((node-compiler expr)
(sub-data children-sub))
basic?)))
(make-sub (%maybe-parenthesize basic? parenthesize? sexpr)
(sub-counter children-sub)
(sub-labels children-sub)))))
(sub-data (compile-within expr 0 '() #f)))
(define (r:grep expr filename)
(r:grep-like #t "grep" '() expr filename))
(define (r:egrep expr filename)
(if (eq? microcode-id/operating-system 'nt)
(r:grep-like #f "grep" '("-E") expr filename)
(r:grep-like #f "egrep" '() expr filename)))
(define (r:grep-like basic? program options iexpr filename)
(let ((expr (compile basic? iexpr))
(port (open-output-string)))
(and (= (run-synchronous-subprocess program
(append options
(list "-e" expr (->namestring filename)))
'output port)
0)
(r:split-lines (get-output-string port)))))
Original test cases work, except that the base strings in the handout
must be quoted:
(pp
(r:grep
(r:seq (r:quote " ")
(r:repeat 3 5 (r:alt (r:quote "cat") (r:quote "dog")))
(r:eol))
"tests.txt"))
("[09]. catdogcat" "[10]. catcatdogdog" "[11]. dogdogcatdogdog")
(pp
(r:grep
(let ((digit
(r:char-from (string->char-set "0123456789"))))
(r:seq (r:bol)
(r:quote "[")
digit
digit
(r:quote "]")
(r:quote ".")
(r:quote " ")
(r:char-from (char-set #\a #\b))
(r:repeat 3 5 (r:alt (r:quote "cat") (r:quote "dog")))
(r:char-not-from (char-set #\d #\e #\f))
(r:eol)))
"tests.txt"))
("[13]. acatdogdogcats")
(pp (r:egrep (r:alt (r:quote "foo") (r:quote "bar") (r:quote "baz"))
"tests.txt"))
("[05]. foo" "[06]. bar" "[07]. foo bar baz quux")
(pp (r:egrep (r:repeat 3 5 (r:alt (r:quote "cat") (r:quote "dog")))
"tests.txt"))
("[09]. catdogcat"
"[10]. catcatdogdog"
"[11]. dogdogcatdogdog"
"[12]. catcatcatdogdogdog"
"[13]. acatdogdogcats"
"[14]. ifacatdogdogs"
"[15]. acatdogdogsme")
(pp
(r:egrep
(r:seq (r:quote " ")
(r:repeat 3 5 (r:alt (r:quote "cat") (r:quote "dog")))
(r:eol))
"tests.txt"))
("[09]. catdogcat" "[10]. catcatdogdog" "[11]. dogdogcatdogdog")
(pp
(r:egrep
(let ((digit
(r:char-from (string->char-set "0123456789"))))
(r:seq (r:bol)
(r:quote "[")
digit
digit
(r:quote "]")
(r:quote ".")
(r:quote " ")
(r:char-from (char-set #\a #\b))
(r:repeat 3 5 (r:alt (r:quote "cat") (r:quote "dog")))
(r:char-not-from (char-set #\d #\e #\f))
(r:eol)))
"tests.txt"))
("[13]. acatdogdogcats")
Here are additional test cases:
(display
(compile #t
(r:quote "hello|")))
hello|
(display
(compile #f
(r:quote "hello|")))
hello\|
I added the test case
[16]. hello|bye
(pp (r:grep
(r:quote "hello|")
"tests.txt"))
("[16]. hello|bye")
(pp (r:egrep
(r:quote "hello|")
"tests.txt"))
("[16]. hello|bye")
(display
(compile #t
(r:seq (r:label 'f (r:quote "f")) (r:ref 'f))))
\(f\)\1
(display
(compile #f
(r:seq (r:label 'f (r:quote "f")) (r:ref 'f))))
(f)\1
(pp (r:egrep
(r:seq (r:label 'animal
(r:alt (r:quote "cat")
(r:quote "dog")))
(r:ref 'animal))
"tests.txt"))
("[10]. catcatdogdog"
"[11]. dogdogcatdogdog"
"[12]. catcatcatdogdogdog"
"[13]. acatdogdogcats"
"[14]. ifacatdogdogs"
"[15]. acatdogdogsme")
|#