Code: Select all
; mparse_words_asc_lower_case.lsp dmemos 11.jan.2009
( silent ; no console-echo
( println ( date ) )
( change-dir "C:\\Documents and Settings\\didi\\My Documents\\newLISP" )
( set 'src_txt ( read-file "wrnpc12.txt" )) ; war_and_peace
( set 'word_char [text]abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_[/text])
( set 'sep_char " \n\r\t!\"#$%&'()*+,-./:;<=>?@[]~\\" ) ; quoted string, so \n \r \t are real control chars; inside [text]...[/text] they stay literal backslash sequences and never match
( set 'lineout "" )
( set 'out_lst '() )
( replace "\r\n" src_txt " " ) ; replace all CR-LF with " "
( set 'src_txt ( lower-case src_txt ))
( while ( < 0 (length src_txt) )
    ( set 'x ( pop src_txt ))
    ( if ( find x word_char )
        ( push x lineout -1)            ; word_char found: extend current word
        ( if ( find x sep_char )        ; else: separator ends the word
            ( if ( < 0 (length lineout))
                ( begin
                    ( push lineout out_lst -1)
                    ( set 'lineout "")))))
)
( set 'word_list (sort (unique out_lst)))
( write-file "word_list_wrnpc12.txt" (string word_list) )
)
( println ( date ))
( println "bye " )
I tried some very big texts; 600 kB of text needs 7 min. The book "War and Peace" is over 3 MB in ASCII text. Would it be faster to divide it into parts and then join the lists?
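For what it's worth, the time is probably lost in ( pop src_txt ): popping the first character of a multi-megabyte string makes newLISP move the whole remainder on every iteration, so the loop grows roughly quadratically with the text size. Dividing the text into parts and joining the lists should therefore help (each pop then only moves one chunk), but it may be simpler to skip the character loop entirely and let parse split on a regular expression. A minimal sketch, not benchmarked; the pattern [^a-z0-9_]+ is my guess at a split equivalent to your word_char/sep_char sets:

Code: Select all
; word list via one regex parse instead of popping characters
( set 'src_txt ( lower-case ( read-file "wrnpc12.txt" )))        ; assumes the change-dir above
( set 'out_lst ( parse src_txt "[^a-z0-9_]+" 0 ))                ; split on runs of non-word chars, option 0 = PCRE
( set 'out_lst ( filter ( fn (w) ( < 0 ( length w ))) out_lst )) ; drop empty strings at the edges
( set 'word_list ( sort ( unique out_lst )))
( write-file "word_list_wrnpc12.txt" ( string word_list ))

Since parse scans the text only once, the full 3 MB file should go through in seconds rather than minutes, though I have not timed it.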