newLISP interface to Tidy
Posted: Sat Jun 16, 2007 7:21 pm
Tidy is a nice program which cleans up malformed (or just plain ugly) HTML. I've always wanted access to it in newLISP without having my newLISP script run it as a command line program. Well, today I finally saw that there was a library version of tidy (called TidyLib).
Here is a first cut of a tidy interface module for newLISP. The interface code is based on the example C code given in http://tidy.sourceforge.net/libintro.html.
Here's an example (assume that the above code is in file "tidy.lsp"):
Enjoy! (as Norman would say)
--Rick
Here is a first cut of a tidy interface module for newLISP. The interface code is based on the example C code given in http://tidy.sourceforge.net/libintro.html.
Code: Select all
;;;; tidy.lsp -- A module to interface TidyLib
;;;; Author: Rick Hanson
;;;; Date: 17 June 2007
;; Switch into the `tidy` context; the definitions that follow are made in it.
(context 'tidy)
;;;---------------------------------------------------------
;;; U S E R C O N F I G U R A T I O N
;;;
;;; Read the descriptions of the following two variables,
;;; and change as appropriate for your needs.
;; This is the location of your TidyLib shared library
;; On Macs it's called libtidy.dylib, on Win32 machines
;; it's called libtidy.dll, on the Penguin and Unices it's
;; called libtidy.so.
;; Path of the TidyLib shared library; it is passed to every `import` call below.
(set 'libtidy "/usr/lib/libtidy.dylib")
;; According to Lutz, you probably don't need to change this.
;; Change it to 64, ONLY IF you know your TidyLib (and probably
;; the rest of your system + newLISP) is LP64.
;; Width of a machine address in bits; the byte sizes used further down
;; are derived from this value.
(set 'machine-address-size-in-bits 32)
;;;---------------------------------------------------------
;;; B O I L E R P L A T E C O D E F O L L O W S
;;;
;;; (meaning that, if you're
;;; (a) just a user of this module AND
;;; (b) you're lucky,
;;; then you won't need to change the code below this line.)
;;; :-)
(import libtidy "tidyCreate")
(import libtidy "tidyOptSetBool")
(import libtidy "tidySetErrorBuffer")
(import libtidy "tidyParseString")
(import libtidy "tidyCleanAndRepair")
(import libtidy "tidyRunDiagnostics")
(import libtidy "tidySaveBuffer")
(import libtidy "tidyBufFree")
(import libtidy "tidyRelease")
(import libtidy "tidyReleaseDate")
;; Byte widths derived from the configured address size.
(set 'machine-address-size-in-bytes (/ machine-address-size-in-bits 8))
;; An unsigned int and a pointer are both taken to be one machine address wide.
(set 'size-of-u_int machine-address-size-in-bytes)
(set 'size-of-address-pointer machine-address-size-in-bytes)
;; Release date of the loaded TidyLib, converted with `date-value`.
;; The text returned by tidyReleaseDate is tokenised with `parse`. When
;; that yields exactly four tokens, the month and year sit one position
;; further right than otherwise; the day is always the first token.
(define tidy-release-date
  (letn ((tokens (parse (get-string (tidy:tidyReleaseDate))))
         ;; "Month0" pads the list so that `find` yields a 1-based month number.
         (month-names '("Month0" "January" "February" "March" "April"
                        "May" "June" "July" "August" "September"
                        "October" "November" "December"))
         (shift (if (= (length tokens) 4) 1 0)))
    (date-value (int (tokens (+ 2 shift)))
                (find (tokens (+ 1 shift)) month-names)
                (int (tokens 0)))))
;;; Since TidyBuffer (in buffio.h) changed on 2006-12-29, this code
;;; checks to see if your TidyLib's release date is before or
;;; on-or-after this date, and tries to do the right thing. This
;;; would all be easier if the Tidy developers used version numbers.
;;;
;;; The right thing is the setup of the following two variables:
;;;
;;; empty-TidyBuffer: an allocation of enough space to account
;;; for the size of a TidyBuffer.
;;;
;;; bp-offset: the offset from the start of the TidyBuffer
;;; struct to struct member `bp', where the TidyLib text output
;;; is stored.
;; Choose the TidyBuffer layout that matches the loaded library and set
;; `empty-TidyBuffer` (a zero-filled string as large as the struct) and
;; `bp-offset` (byte offset of the `bp` member) accordingly.
(let ((layout-changed-on (date-value 2006 12 29)))
  (if (>= tidy-release-date layout-changed-on)
      (begin
        ;; Layout used on or after the change date:
        ;; struct _TidyBuffer
        ;; {
        ;;   TidyAllocator* allocator; /**< Memory allocator */
        ;;   byte* bp;                 /**< Pointer to bytes */
        ;;   uint size;                /**< # bytes currently in use */
        ;;   uint allocated;           /**< # bytes allocated */
        ;;   uint next;                /**< Offset of current input position */
        ;; };
        (set 'empty-TidyBuffer
             (dup "\000" (+ (* 2 size-of-address-pointer)
                            (* 3 size-of-u_int))))
        ;; `bp` follows the allocator pointer.
        (set 'bp-offset size-of-address-pointer))
      (begin
        ;; Layout used before the change date:
        ;; struct _TidyBuffer
        ;; {
        ;;   byte* bp;       /**< Pointer to bytes */
        ;;   uint size;      /**< # bytes currently in use */
        ;;   uint allocated; /**< # bytes allocated */
        ;;   uint next;      /**< Offset of current input position */
        ;; };
        (set 'empty-TidyBuffer
             (dup "\000" (+ size-of-address-pointer
                            (* 3 size-of-u_int))))
        ;; `bp` is the first member.
        (set 'bp-offset 0))))
;;; The following flags are recovered from tidyenum.h of
;;; TidyLib. (Fortunately, the developers did not change the enums
;;; -- the old ones should stay the same from version to version.)
;; Option ids handed to tidyOptSetBool, followed by the two Bool values.
(set 'TidyXmlOut 22)      ; Output XML.
(set 'TidyXhtmlOut 23)    ; Output extensible HTML.
(set 'TidyHtmlOut 24)     ; Output plain HTML, even for XHTML input.
(set 'TidyForceOutput 64) ; Output document even if errors were found.
(set 'no 0)
(set 'yes 1)
;; Run TidyLib over the markup in INPUT and return the cleaned-up text.
;;
;; output-type -- id of the boolean TidyLib option to switch on before
;;                parsing (TidyXmlOut, TidyXhtmlOut or TidyHtmlOut).
;; input       -- string holding the markup to clean.
;;
;; Returns the text TidyLib stored in its output buffer, or nil (after
;; printing a message) when one of the TidyLib calls failed.
(define (tidy:tidy output-type input)
  (let ((output empty-TidyBuffer)
        (output-contents nil)
        (errbuf empty-TidyBuffer)
        (rc -1)
        (ok nil)
        (tdoc (tidyCreate))
        ;; unpack format for one pointer-sized unsigned integer:
        ;; "Lu" reads 64 bits, "lu" reads 32 bits.
        (pointer-format (if (= size-of-address-pointer 8) "Lu" "lu")))
    (setq ok (tidyOptSetBool tdoc output-type yes))
    ;; tidyOptSetBool returns a C Bool. The integer 0 is NOT false in
    ;; newLISP (only nil and the empty list are), so compare explicitly.
    (if (!= 0 ok)
        (setq rc (tidySetErrorBuffer tdoc errbuf)))
    (if (>= rc 0)
        (setq rc (tidyParseString tdoc input)))
    (if (>= rc 0)
        (setq rc (tidyCleanAndRepair tdoc)))
    (if (>= rc 0)
        (setq rc (tidyRunDiagnostics tdoc)))
    ;; rc > 1 means errors were found: force output anyway, and give up
    ;; (rc = -1) only if that option cannot be set.
    (if (> rc 1)
        (setq rc (if (not (= 0 (tidyOptSetBool tdoc
                                               TidyForceOutput
                                               yes)))
                     rc -1)))
    (if (>= rc 0)
        (setq rc (tidySaveBuffer tdoc output)))
    (if (>= rc 0)
        ;; Slice the `bp` pointer out of the TidyBuffer struct, decode it
        ;; as an address and read the C string stored there.
        (setq output-contents
              (get-string
               (first
                (unpack pointer-format
                        (bp-offset size-of-address-pointer output)))))
        (println (format "A severe error (%d) occurred.\n" rc)))
    ;; Release everything TidyLib allocated, on success and failure alike.
    (tidyBufFree output)
    (tidyBufFree errbuf)
    (tidyRelease tdoc)
    output-contents))
;; One-argument converters: each fixes the output type and passes the
;; markup string on to `tidy`.
(define (xml<- markup) (tidy TidyXmlOut markup))
(define (xhtml<- markup) (tidy TidyXhtmlOut markup))
(define (html<- markup) (tidy TidyHtmlOut markup))
;; Switch back to the MAIN context now that the module's definitions are done.
(context MAIN)
Code: Select all
> (load "tidy.lsp")
MAIN
> (print (tidy:xhtml<- "<title>Foo</title><p>Foo!"))
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta name="generator" content=
"HTML Tidy for Mac OS X (vers 1st December 2004), see www.w3.org" />
<title>Foo</title>
</head>
<body>
<p>Foo!</p>
</body>
</html>
--Rick