Code: Select all
;; Functions:
;;
;; (reg x y ...) translate and join epressions
;; (bracket s c) bracket expressions with parens
;; (r1 x) match 1 or more times +
;; (r01 x) match 0 or 1 times ?
;; (r0 x) match 0 or more times *
;; (r2 x) match 2 or more times {2,}
;; (ntom x n m) match a min of n and a max of m {n,m}
;; (lazy x) perform lazy (non-greedy) evaluation
;; (r-n x) negate character class or individual character
;; (r-or x y) alternation (x|y)
;; (r-run x y) [a-z] run of characters
;; (r-chc x) [abc] character class
;; Bracket a string with a given bracket character. If no character than parentheses () are
;; assumed. For regex expressions the function acts as the grouping operator ().
(define (bracket s c , opt)
(setq c (or c "(")
opt (if (member c '("(" ")")) '("(" ")")
(member c '("[" "]")) '("[" "]")
(member c '("<" ">")) '("<" ">")
'("{" "}")
)
)
(join (list (first opt) s (last opt))))
;; Convert numbers or symbols to appropriate character strings. Note that character
;; classes or special characters are denoted by symbols. Ascii codes are typed in
;; directly as integers.
(define (convert-codes c)
(if ;c is an ascii code
(= c 7) "\\a" ;bell
(= c 8) "\\b" ;backspace
(= c 9) "\\t" ;tab
(= c 10) "\\n" ;newline
(= c 12) "\\f" ;formfeed
(= c 13) "\\r" ;carriage return
(= c 27) "\\e" ;escape
(or (<= 48 c 57)(<= 65 c 90)(<= 97 c 122)) (char c) ;alphabet or digits
;c is a symbol
(= c 'a) "\\a" ;bell
(= c 'b) "\\b" ;backspace
(= c 't) "\\t" ;tab
(= c 'n) "\\n" ;newline
(= c 'f) "\\f" ;formfeed
(= c 'r) "\\r" ;carriage return
(= c 'e) "\\e" ;escape
(= c '^) "^" ;anchor to start of line
(= c '$) "$" ;anchor to end of line
(= c 'L) "[a-z]" ;any lower case letter
(= c 'U) "[A-Z]" ;any upper case letter
(= c 'UL) "[^A-Za-z]" ;anything other than an upper or lower case letter
(= c 'ul) "[A-Za-z]" ;any lower or upper case letter
(= c 'V) "[^aeiouAEIOU]" ;any consonant
(= c 'v) "[aeiouyAEIOUY]" ;any vowel
(= c 'br) "[()\[\]{}<>]" ;any bracket symbol
(= c 'lb) "[(\[{<]" ;any left bracket symbol
(= c 'rb) "[)\]}>]" ;any right bracket symbol
(= c 'BR) "[^()\[\]{}<>]" ;any non-bracket symbol
(= c 'W) "\\W" ;matches a non-alphanumeric character
(= c 'w) "\\w" ;matches an alphanumeric character
(= c 'D) "\\D" ;non-digit
(= c 'd) "\\d" ;digit
(= c 'S) "\\S" ;non-space
(= c 's) "\\s" ;space
(= c 'se) "[.?!]" ;a sentence ending symbol
(= c 'P) "[^,;:.?! ]" ;a non sentence punctuation symbol
(= c 'p) "[,::.?! ]" ;a sentence punctuation symbol
(= c '.) "." ;any single character
(= c '*) ".*" ;one or more of any character
(= c 'rp) ".{2,}" ;2 or more of any character (repeats)
(= c 'K) "[^!-~]" ;is not a key character
(= c 'k) "[!-~]" ;is a key character
(= c '+) "[-+]" ;matches + or -
c
)
)
;; Convert a list of strings, ascii codes, symbols or other lists into
;; a single string.
(define (reg%% z)
(if (string? z) z
(or (integer? z)(symbol? z)) (convert-codes z)
(list? z)(join (map reg%% z))
nil
))
;; Convert all arguments into strings and join them into a single string.
(define-macro (reg)
(join (map reg%% (map eval (args)))))
;; Take the negation of a string pattern.
(define (r-n x)
(setq x (reg x))
(if (= "^" (first x)) (rest x)
(= "[^" (0 2 x)) (join (list "[" (slice x 2)))
(= "[" (first x)) (join (list "[^" (rest x)))
(= "^(" (0 2 x)) (rest x)
(join (list "^" x))
))
;; Match 1 or more of x. If L is non-nil do a lazy (non-greedy) analysis.
;; Example: (r1 "a") -> "a+"
;; (r1 "a" true) -> "a+?"
(define (r1 x L)
(join (list (reg x) "+" (if L "?" ""))))
;; Match 0 or more of x. If L is non-nil do a lazy (non-greedy) analysis.
;; Example: (r0 "a") -> "a*"
;; (r0 "a" true) -> "a*?"
(define (r0 x L)
(join (list (reg x) "*" (if L "?" ""))))
;; Match 0 or 1 of x. If L is non-nil do a lazy (non-greedy) analysis.
;; Example: (r01 "a") -> "a?"
;; (r01 "a" true) -> "a??"
(define (r01 x L)
(join (list (reg x) "?" (if L "?" ""))))
;; Match 2 or more i.e. repeats of x. If L is non-nil do a lazy (non-greedy) analysis.
;; Example: (r2 "a") -> "a{2,}"
;; (r2 "a" true) -> "a{2,}?"
(define (r2 x L)
(join (list (reg x) "{2,}" (if L "?" ""))))
;; Another way of doing lazy evaluation by having a function specifically for it.
;; The function takes a string and appends "?" onto the end of it. So you can
;; write (r1 "A" true) as (lazy (r1 "A")) if you wish. This might be handier if you
;; wanted to use map to map lazy onto several expressions at once.
(define (lazy x)(join (list (reg x) "?")))
;; Match x a minimum of n times and a maximum of m times. If L is non-nil
;; do a lazy (non-greedy) analysis. If m is 0 then the maximum is infinity.
;; Example: (ntom "a" 2 4) -> "a{2,4}"
;; (ntom "a" 2 0) -> "a{2,}"
;; (ntom "a" 2) -> "a{2}"
;; (ntom "a" 2 4 true) -> "a{2,4}?"
(define (ntom x n m L)
(join (list (reg x)
(bracket (join (list (string n)
(if (>= m 0) "," "")
(if (> m 0) (string m) "")
)) "}")
(if L "?" "")
)))
;; Perform alternation on a series of arguments. If the last argument is +
;; do a positive lookahead, if the last argument is - do a negative lookahead.
;; Example: (r-or "cat" "dog") -> "(cat|dog)"
;; (r-or "cat" "dog" +) -> "(?=cat|dog)"
;; (r-or "cat" "dog" -) -> "(?!cat|dog)"
(define-macro (r-or)
(setq L (last (args))
interior (map (fn (x)(join (list "|" (reg x)))) (args))
interior (rest (join interior))
)
(bracket (join (list (if (= L +) "?=" (= L -) "?!" "") interior)))
)
;; Create a run of characters. You must backslash
;; a character for it to be taken literally.
;; Example: (r-run "a" "b") -> "[a-b]"
(define (r-run s f)
(bracket
(join (list (reg s)
"-"
(reg f))) "["))
;; Create a character class. You must backslash a character
;; for it to be taken literally.
;; Example: (r-chc "f9p") -> "[f9p]"
(define (r-chc s)(bracket (reg s) "]"))
"cat" - a single string just remains a single string
"^cat" - (reg '^ "cat")
"(cat|dog)" - (r-or "cat" "dog")
"[a9fr]" - (r-chc "a9fr")
"[^b-t]" - (r-n (r-run "b" "t")) we use r-n to negate the class
"[b-tamcd]" - (reg (r-run "b" "t")(r-chc "amcd"))
"[-+]?[0-9]" - (reg (r01 '+) 'd)
"([-+]?[0-9])+" - (r1 (bracket (reg (r01 '+) 'd)))
Perhaps you are getting the idea now? The REG function is used for glueing multiple expressions together into a single string, it translates and then concatenates. The BRACKET function is used for grouping, it is the regex equivalent of LIST.
If you take the time to look at the CONVERT-CODES function you will notice that I have taken the liberty of adding several shorthands for various character classes that I think are useful. Perhaps there are suggestions for more or less?
Here are the advantages that I think justify this approach:
1. A consistent syntax instead of myriad rules to remember.
2. It adds extra options for the programmer while still allowing standard notation to be used if desired.
3. Other NewLISP functions can be used on parts of the expression providing untold power to manipulate expressions in ways not thought of before. For instance, instead of normal backtracking you could use the SETQ function to set part of the expression to a variable that you call later in the expression.
4. More readable? Debateable of course but I think that the larger the expression the more readable it will be in functional format instead of comic book swearing typology.
Disadvantages:
1. Unfamiliar notation.
2. Short expressions can be more wordy.
I am hoping that by providing an example of this concept that it will stimulate some thought and discussion on the issue. Ideas? Suggestions?