The problem seems to be with my matching algorithm. It runs through around ten regexp patterns on each name (after 'normalizing' the name) until it hits one that matches. I've arranged the patterns as best as I can to match the most efficiently. However, there is a huge speed difference between the normalization function, which does a bunch of replaces, and the parse-name function, which does a bunch of regex searching.
After running through nearly 8000 names, the python version has about a half a second lead on my quad core g5. They both use nearly the same algorithm to do matching and replacing (in fact, I copied and pasted most of the regexps).
Any ideas?
Code: Select all
#!/usr/bin/env newlisp
(define (int->roman num , result)
(set 'result "")
(let ((numerals '((1 "I") (4 "IV") (5 "V") (9 "IX") (10 "X") (40 "XL")
(50 "L") (90 "XC") (100 "C") (400 "CD") (500 "D") (900 "CM")
(1000 "M"))))
(dolist (numeral-set (reverse numerals))
(let ((value (first numeral-set)) (numeral (last numeral-set)))
(while (>= num value)
(begin
(set 'result (join (list result numeral)))
(set 'num (- num value))))))) result)
(constant '*patterns*
'(({^\b([\w\-\']{2,})\b\s\b((?:St\. )*[\w\-\']{2,})\b$} (first-name last-name))
({^\b([\w\-\']{2,})\b\s\b([\w\-\']{2,})\b\s\b((?:St\. )*[\w\-\']{2,})\b$} (first-name middle-name last-name))
({^\b([\w\-\']{2,})\b\s\b(\w\.)\s\b((?:St\. )*[\w\-\']{2,})\b$} (first-name middle-name last-name))
({^\b(\w\.)\s\b([\w\-\']{2,})\b\s\b((?:St\. )*[\w\-\']{2,})\b$} (first-name middle-name last-name))
({^\b(\w\.)\s\b(\w\.)\s\b((?:St\. )*[\w\-\']{2,})\b$} (first-name middle-name last-name))
({^\b([\w\-\']{2,})\b\s\b(\w\.)\s\b((?:St\. )*[\w\-\']{2,}\b\s\b[\w\-\']{2,})\b$} (first-name middle-name last-name))
({^\b([\w\-\']{2,})\b\s\b(\w\.\s\w\.)\s\b((?:St\. )*[\w\-\']{2,})\b$} (first-name middle-name last-name))
({^\b([\w\-\']{2,}\b\s\b[\w\-\']{2,})\b\s\b(\w\.)\s\b((?:St\. )*[\w\-\']{2,})\b$} (first-name middle-name last-name))
({^\b([\w\-\']{2,})\b\s\b([\w\-\']{2,}\b\s\b[\w\-\']{2,})\b\s\b((?:St\. )*[\w\-\']{2,})\b$} (first-name middle-name last-name))))
(constant '*romans-list* (map 'int->roman (sequence 1 25)))
(constant '*romans* (format {\s(?P<suffix>%s)$}
(join (map (lambda (i)
(format "(?:%s)" i))
*romans-list*) "|")))
(constant '*suffixes* {\b(?P<suffix>[JjSs][Rr])\.*\b})
(set '*parse-errors* '())
(define (normalize nm)
(let ((comma-matches (find-all {,(?!(\s*[JjSs][Rr]\b\.))} nm)))
(cond ((= (length comma-matches) 2)
(replace {^\s*(.+?)\s*,\s*(.+?)\s*,\s*(.+?)\s*$} nm (format "%s %s %s" $3 $1 $2) 0))
((= (length comma-matches) 1)
(replace {^\s*(.+?)\s*,\s*(.+?)\s*$} nm (format "%s %s" $2 $1) 0)))
(replace "," nm "")
(replace {(\.)\s*} nm ". " 0)
(replace {\b(\w)\b[^$\.]} nm (format "%s. " (upper-case $1)) 0)
(replace {\b([\w\-\']+?)\b} nm (title-case $1 true) 0)
(replace {\b([JjSs][Rr])\b\.*} nm (format "%s." 0))
(replace *romans* nm (format " %s" (upper-case $1)) 1)
(set 'nm (trim nm))
(replace {\s+} nm " " 0)) nm)
(define (match-pattern str pattern , p-match)
(set 'data '((first-name nil) (middle-name nil) (last-name nil)))
(set 'p-match (regex (first pattern) str))
(if (not (nil? p-match))
(begin
(dolist (x (last pattern))
(replace-assoc x data (list x (p-match (* 3 (+ 1 $idx))))))) nil))
(define (parse-name nm , data original-name)
(set 'original-name nm)
(set 'nm (normalize nm))
(set 'data '((first-name nil) (middle-name nil)
(last-name nil) (suffix nil)))
(let ((suffix-match (regex {\b(?P<suffix>[JjSs][Rr])\.*\b} nm))) ;("Jr" 18 2 "Jr" 18 2)
(if (not (nil? suffix-match))
(begin
(replace-assoc 'suffix data (list 'suffix (suffix-match 3)))
(replace (format " %s\.*" (last (assoc 'suffix data))) nm "" 1))
(begin
(set 'suffix-match (regex *romans* nm 1))
(if (not (nil? suffix-match))
(begin
(replace-assoc 'suffix data (list 'suffix (suffix-match 3)))
(replace (format " %s" (last (assoc 'suffix data))) nm "" 1))))))
(let ((good-match (catch
(dolist (pattern *patterns*)
(let ((possible-match (match-pattern nm pattern)))
(if (not (nil? possible-match))
(throw possible-match)))))))
(if (nil? good-match)
(push original-name *parse-errors* -1)
(dolist (cell good-match)
(replace-assoc (first cell) data cell)))) data)
(load "sample-names.lsp")
(map parse-name nm-list)
(println *parse-errors*)