



Tokenise__ (lines 755-800)
Back to List
Browsing parserm.h
! Tokenise__ buf tab
! Splits the text held in buf into words, records where each word lies in
! tab, and then looks each word up in the dictionary.
!   buf - word 0 is read as the number of characters; the characters are
!         read as bytes beginning WORDSIZE bytes into the array.
!   tab - output. tab-->0 receives the word count. Word number n (counting
!         from 0) occupies three entries:
!           tab-->(3n+1)  result of the dictionary search for the word
!           tab-->(3n+2)  length of the word in characters (not clipped)
!           tab-->(3n+3)  position of the word, counted from the start of
!                         buf, so the first character is at WORDSIZE
! No more than MAX_BUFFER_WORDS words are recorded; any text after that
! is left unscanned.
0755 [ Tokenise__ buf tab
0756 cx numwords len bx ix wx wpos wlen val res dictlen entrylen;
0757 len = buf-->0; ! character count, taken from the first word of buf
0758 buf = buf+WORDSIZE; ! step past that word: buf->0 is now the first character
0759
0760 ! First, split the buffer up into words. We use the standard Infocom
0761 ! list of word separators (comma, period, double-quote).
0762
0763 cx = 0;
0764 numwords = 0;
0765 while (cx < len) {
0766 while (cx < len && buf->cx == ' ') cx++; ! skip spaces between words
0767 if (cx >= len) break; ! only spaces remained
0768 bx = cx; ! bx marks where this word begins
0769 if (buf->cx == '.' or ',' or '"') cx++; ! a separator is a one-character word
0770 else {
0771 while (cx < len && buf->cx ~= ' ' or '.' or ',' or '"') cx++; ! run on until a space, a separator or the end
0772 }
0773 tab-->(numwords*3+2) = (cx-bx); ! word length
0774 tab-->(numwords*3+3) = WORDSIZE+bx; ! position relative to the original buf (buf was advanced above)
0775 numwords++;
0776 if (numwords >= MAX_BUFFER_WORDS) break; ! table is full: stop scanning
0777 }
0778 tab-->0 = numwords;
0779
0780 ! Now we look each word up in the dictionary.
0781
0782 dictlen = #dictionary_table-->0; ! first word of the dictionary table, passed below as the number of entries
0783 entrylen = DICT_WORD_SIZE + 7; ! size of one entry; presumably 1 leading byte + the word text + 6 further bytes - TODO confirm
0784
0785 for (wx=0 : wx<numwords : wx++) {
0786 wlen = tab-->(wx*3+2);
0787 wpos = tab-->(wx*3+3);
0788
0789 ! Copy the word into the gg_tokenbuf array, clipping to DICT_WORD_SIZE
0790 ! characters and lower case.
0791 if (wlen > DICT_WORD_SIZE) wlen = DICT_WORD_SIZE; ! clips the local copy only; tab keeps the full length
0792 cx = wpos - WORDSIZE; ! back to an index into the advanced buf
0793 for (ix=0 : ix<wlen : ix++) gg_tokenbuf->ix = glk($00A0, buf->(cx+ix)); ! Glk call $00A0 does the lower-casing
0794 for (: ix<DICT_WORD_SIZE : ix++) gg_tokenbuf->ix = 0; ! zero-pad the rest of the key
0795
0796 val = #dictionary_table + WORDSIZE; ! entries start just after the count word
0797 @binarysearch gg_tokenbuf DICT_WORD_SIZE val entrylen dictlen 1 1 res; ! operands per the Glulx spec: key, key size, start, entry size, entry count, key offset 1, options 1 (key given by address); res is the matching entry's address, or 0 if none
0798 tab-->(wx*3+1) = res;
0799 }
0800 ];
Last updated 27 February 2004. The librarian in charge of this page is Graham Nelson (graham@gnelson.demon.co.uk) assisted by C Knight. Please email any comments, suggestions or corrections to cedenqs@inform-fiction.org.