diff options
Diffstat (limited to 'rgx.c')
-rw-r--r-- | rgx.c | 661 |
1 files changed, 661 insertions, 0 deletions
@@ -0,0 +1,661 @@ +#include "scm.h" +#include "regex.h" +#include <stdio.h> + +static char rcsid[] = + "$Id: rgx.c, v 1.20 1995/02/15 04:39:45 dpb Exp $"; + +#ifdef HAVE_ALLOCA +# include <alloca.h> +# define ALLOCA_PROTECT typedef int foobazzz +# define ALLOCA alloca +#else +# define ALLOCA_PROTECT SCM alloca_protect=EOL +# define ALLOCA(size) \ + (alloca_protect=cons(makstr((long)(size)), alloca_protect), \ + (void *)CDR(CAR(alloca_protect))) + +#endif + +#ifdef _GNU_SOURCE +/* following two lines stolen from GNU regex.c */ +# define CHAR_SET_SIZE 256 +# define ISUPPER(c) (isascii (c) && isupper (c)) +#endif + +/* forward function defs */ + +SCM lregsearch(); +SCM lregsearchv(); + +/* Posix regexp bindings. */ + +static char s_regex[] = "regex"; +static char s_regcomp[] = "regcomp", s_regerror[] = "regerror"; +static char s_regexec[] = "regexec", s_regmatp[] = "regmatch?"; +static char s_regsearch[] = "regsearch", s_regmatch[] = "regmatch"; +static char s_regsearchv[] = "regsearchv", s_regmatchv[] = "regmatchv"; +static char s_stringsplit[] = "string-split"; +static char s_stringsplitv[] = "string-splitv"; +static char s_stringedit[] = "string-edit"; + +#define s_error &s_regerror[3] + +#define RGX_INFO(obj) ((regex_info*)CDR(obj)) +#define RGX_PATTERN(obj) (((regex_info*)CDR(obj))->pattern) +#define RGX(obj) (&((regex_info*)CDR(obj))->rgx) +#ifndef _GNU_SOURCE +# define RGX2(obj) (&((regex_info*)CDR(obj))->rgx_anchored) +#endif + +#define FIXUP_REGEXP(prog) \ +{ \ + if (STRINGP(prog)) \ + prog = lregcomp(prog, UNDEFINED); \ + if (NIMP(prog) && CONSP(prog) && STRINGP(CAR(prog)) && \ + NIMP(CDR(prog)) && CONSP(CDR(prog)) && STRINGP(CAR(CDR(prog)))) \ + prog = lregcomp(CAR(prog), CAR(CDR(prog))); \ +} + +typedef struct regex_info { + SCM pattern; /* string we compiled to create our reg exp */ + regex_t rgx; +#ifndef _GNU_SOURCE + int options; /* for anchored pattern when matching */ + regex_t rgx_anchored; +#endif +} regex_info; + +sizet fregex(ptr) + CELLPTR ptr; +{ + regfree(RGX(ptr)); +#ifndef _GNU_SOURCE + /* options are null => we compiled the anchored pattern */ + if (RGX_INFO(ptr)->options==NULL) + regfree(RGX2(ptr)); +#endif + free(CHARS(ptr)); + return sizeof(regex_t); +} + +int prinregex(exp, port, writing) + SCM exp; SCM port; int writing; +{ + lputs("#<regex ", port); + intprint(CDR(exp), 16, port); + lputc(' ', port); + iprin1(RGX_PATTERN(exp), port, writing); + lputc('>', port); + return 1; +} + +SCM markregex(ptr) + SCM ptr; +{ + SETGC8MARK(ptr); + SETGC8MARK(RGX_PATTERN(ptr)); + return BOOL_F; +} + +int tc16_rgx; +static smobfuns rgxsmob = {markregex, fregex, prinregex}; + +SCM lregerror(scode) + SCM scode; +{ + int code, len; + SCM str; + ASSERT(INUMP(scode), scode, ARG1, s_regerror); + code = INUM(scode); + if (code < 0) + return makfromstr("Invalid code", sizeof("Invalid code")-1); + /* XXX - is regerror posix or not? */ +#ifdef __REGEXP_LIBRARY_H__ + /* XXX - gnu regexp doesn't use the re parameter, so we will + ignore it in a very untidy way. */ + len = regerror(code, 0, 0, 0); + str = makstr(len-1); + regerror(code, 0, CHARS(str), len); +#else + str = makfromstr(s_error, (sizet)5); +#endif + return str; +} + +SCM lregcomp(pattern, flags) + SCM pattern, flags; +{ + SCM z; + int i, options; + regex_t *prog; + regex_info *info; + char *flagchars; +#ifdef _GNU_SOURCE + int fastmap = 0; + int ignore_case = 0; + char *err_msg; +#endif + + ASSERT(NIMP(pattern) && STRINGP(pattern), pattern, ARG1, s_regcomp); + ASSERT(UNBNDP(flags) || (NIMP(flags) && STRINGP(flags)), + flags, ARG2, s_regcomp); + NEWCELL(z); + DEFER_INTS; + SETCHARS(z, info=(regex_info*)must_malloc((long)sizeof(regex_info), s_regex)); + prog = &(info->rgx); + CAR(z) = tc16_rgx; +#ifdef __REGEXP_LIBRARY_H__ + for(i=sizeof(regex_t);i--;((char *)prog)[i] = 0); +# ifndef _GNU_SOURCE + { + regex_t *prog2; + prog2 = &(info->rgx_anchored); + for(i=sizeof(regex_t);i--;((char *)prog2)[i] = 0); + } +# endif +#endif + + ALLOW_INTS; + info->pattern = pattern; + +#ifdef _GNU_SOURCE + options = RE_SYNTAX_POSIX_EXTENDED; +#else + options = REG_EXTENDED; +#endif + + if (!UNBNDP(flags)) { + flagchars = CHARS(flags); + for (i=0; i<LENGTH(flags); i++) + switch (flagchars[i]) { +#ifdef _GNU_SOURCE + case 'n': + options |= RE_HAT_LISTS_NOT_NEWLINE; + options &= (~RE_DOT_NEWLINE); + break; + case 'i': + ignore_case = 1; + break; + case '0': + options &= (~RE_DOT_NOT_NULL); + break; + case 'f': + fastmap = 1; + break; +#else + case 'n': + options |= REG_NEWLINE; + break; + case 'i': + options |= REG_ICASE; + break; +#endif + } + } + +#ifdef _GNU_SOURCE + DEFER_INTS; + if (fastmap) + prog->fastmap = must_malloc(CHAR_SET_SIZE, s_regex); + + if (ignore_case) { + prog->translate = must_malloc(CHAR_SET_SIZE, s_regex); + for (i = 0; i < CHAR_SET_SIZE; i++) + prog->translate[i] = ISUPPER (i) ? tolower (i) : i; + } + + re_set_syntax(options); + err_msg = (char *)re_compile_pattern(CHARS(pattern), LENGTH(pattern), prog); + ALLOW_INTS; + prog->regs_allocated = REGS_FIXED; + + /* if error, compile using regcomp to get the error number */ + if (err_msg) { + int i; + char *tmppat; + SCM protect; + + /* Fixup in case pattern has null characters */ + tmppat = CHARS(protect=makstr(LENGTH(pattern))); + bcopy(CHARS(pattern), tmppat, LENGTH(pattern)); + for (i=0; i<LENGTH(pattern); i++) + if (tmppat[i] == 0) + tmppat[i] = ' '; + + i = regcomp(prog, tmppat, options); + z = MAKINUM(i); + } +#else + info->options = options; + i = regcomp(prog, CHARS(pattern), options); + if (i) z = MAKINUM(i); +#endif + return z; +} + +SCM lregexec(prog, str) + SCM prog, str; +{ + ALLOCA_PROTECT; + + FIXUP_REGEXP(prog); + ASSERT(NIMP(prog) && tc16_rgx==CAR(prog), prog, ARG1, s_regexec); + ASSERT(NIMP(str) && STRINGP(str), str, ARG2, s_regexec); + +#ifdef _GNU_SOURCE + return lregsearchv(prog, str, EOL); +#else /* not _GNU_SOURCE */ + { + size_t nsub; + SCM ans; + regmatch_t *pm; + int flags = 0; /* XXX - optional arg? */ + + nsub = RGX(prog)->re_nsub + 1; /* XXX - is this posix? */ + pm = ALLOCA(nsub * sizeof(regmatch_t)); + if (regexec(RGX(prog), CHARS(str), nsub, pm, flags) != 0) + ans = BOOL_F; + else { + ans = make_vector(MAKINUM(2L * nsub), MAKINUM(-1L)); + while (nsub--) { + VELTS(ans)[2*nsub+0] = MAKINUM(pm[nsub].rm_so); + VELTS(ans)[2*nsub+1] = MAKINUM(pm[nsub].rm_eo); + } + } + return ans; + } +#endif /* _GNU_SOURCE */ +} + +SCM lregmatp(prog, str) + SCM prog, str; +{ + FIXUP_REGEXP(prog); + ASSERT(NIMP(prog) && tc16_rgx==CAR(prog), prog, ARG1, s_regmatp); + ASSERT(NIMP(str) && STRINGP(str), str, ARG2, s_regmatp); + +#ifdef _GNU_SOURCE + return (lregsearch(prog, str, EOL)==BOOL_F)?BOOL_F:BOOL_T; +#else /* not _GNU_SOURCE */ + { + int flags = 0; /* XXX - optional arg? */ + + flags = regexec(RGX(prog), CHARS(str), 0, NULL, flags); + if (!flags) return BOOL_T; + if (REG_NOMATCH!=flags) wta(MAKINUM(flags), s_error, s_regmatp); + return BOOL_F; + } +#endif +} + +#define SCALAR 0 +#define VECTOR 1 + +#define MATCH 0 +#define SEARCH 1 + +SCM lregsearchmatch(prog, str, args, search, vector) + SCM prog, str, args; + int vector, search; +{ + int len = ilength(args); + int start, size, nsub; + SCM matches; + ALLOCA_PROTECT; + + FIXUP_REGEXP(prog); + ASSERT(NIMP(prog) && tc16_rgx==CAR(prog), prog, ARG1, s_regsearch); + ASSERT(NIMP(str) && STRINGP(str), str, ARG2, s_regsearch); + ASSERT(len<=2, args, WNA, s_regsearch); + ASSERT((len<1)||(INUMP(CAR(args))), CAR(args), ARG3, s_regsearch); + ASSERT((len<2)||(INUMP(CAR(CDR(args)))), CAR(CDR(args)), ARG4, s_regsearch); + + start = (len>=1)?(INUM(CAR(args))):0; + size = (len>=2)?(INUM(CAR(CDR(args)))):LENGTH(str); + +#ifdef _GNU_SOURCE + { + int ret, dir=1; + struct re_registers regs, *pregs=NULL; + + if (search && start<0) + start *= -1, dir = -1; + + if (vector) { + pregs = ®s; + nsub = RGX(prog)->re_nsub + 1; + regs.num_regs = nsub; + + regs.start = ALLOCA(nsub * sizeof(regoff_t)); + regs.end = ALLOCA(nsub * sizeof(regoff_t)); + } + + if (search) + ret = re_search(RGX(prog), CHARS(str), size, start, dir*size, pregs); + else + ret = re_match(RGX(prog), CHARS(str), size, start, pregs); + + if (ret < 0) + return BOOL_F; + + if (!vector) + return MAKINUM(ret); + + matches = make_vector(MAKINUM(2L * nsub), MAKINUM(-1L)); + while (nsub--) { + VELTS(matches)[2*nsub+0] = MAKINUM(regs.start[nsub]); + VELTS(matches)[2*nsub+1] = MAKINUM(regs.end[nsub]); + } + return matches; + } +#else /* not _GNU_SOURCE */ + { + regex_t *regexp; + regmatch_t *pm; + char *search_string; + if (size > LENGTH(str)) + size = LENGTH(str); + + if (start<0 || start >= size) + return BOOL_F; + + if (size < LENGTH(str)) { + search_string = ALLOCA(size-start+1); + bcopy(CHARS(str)+start, search_string, size-start); + search_string[size-start] = 0; + } else + search_string = CHARS(str)+start; + + nsub = RGX(prog)->re_nsub + 1; + pm = ALLOCA(nsub * sizeof(regmatch_t)); + if (search) + regexp = RGX(prog); + else { + /* doing a match */ + if (RGX_INFO(prog)->options) { + /* strlen & strcpy OK, posix patterns are null terminated */ + char *pattern; + + pattern = ALLOCA(strlen(CHARS(RGX_PATTERN(prog)))+2); + pattern[0] = '^'; + strcpy(pattern+1, CHARS(RGX_PATTERN(prog))); + regcomp(RGX2(prog), pattern, RGX_INFO(prog)->options); + RGX_INFO(prog)->options = 0; + } + regexp = RGX2(prog); + } + + if (regexec(regexp, search_string, nsub, pm, 0) != 0) + return BOOL_F; + + if (vector) { + matches = make_vector(MAKINUM(2L * nsub), MAKINUM(-1L)); + while (nsub--) { + VELTS(matches)[2*nsub+0] = MAKINUM(pm[nsub].rm_so + start); + VELTS(matches)[2*nsub+1] = MAKINUM(pm[nsub].rm_eo + start); + } + return matches; + } + + if (search) + return MAKINUM(pm[0].rm_so + start); + else + return MAKINUM(pm[0].rm_eo - pm[0].rm_so); + } + +#endif /* _GNU_SOURCE */ +} + +SCM lregsearch(prog, str, args) + SCM prog, str, args; +{ + return lregsearchmatch(prog, str, args, SEARCH, SCALAR); +} + +SCM lregsearchv(prog, str, args) + SCM prog, str, args; +{ + return lregsearchmatch(prog, str, args, SEARCH, VECTOR); +} + +SCM lregmatch(prog, str, args) + SCM prog, str, args; +{ + return lregsearchmatch(prog, str, args, MATCH, SCALAR); +} + +SCM lregmatchv(prog, str, args) + SCM prog, str, args; +{ + return lregsearchmatch(prog, str, args, MATCH, VECTOR); +} + +SCM stringsplitutil(prog, str, vector) + SCM prog, str; + int vector; +{ + int anchor, match_start, match_end, num_substrings, num_elements; + int search_base; + SCM next_break, substrings, ret; + SCM st_start, st_end; + + FIXUP_REGEXP(prog); + ASSERT(NIMP(prog) && tc16_rgx==CAR(prog), prog, ARG1, s_stringsplit); + ASSERT(NIMP(str) && STRINGP(str), str, ARG2, s_stringsplit); + + substrings = EOL; + anchor = 0; + search_base = 0; + num_substrings = 0; + next_break = lregsearchv(prog, str, cons(MAKINUM(search_base), EOL)); + + while (next_break != BOOL_F) { + match_start = INUM(VELTS(next_break)[0]); + match_end = INUM(VELTS(next_break)[1]); + + if (match_start < match_end) { + substrings=cons2(MAKINUM(anchor), MAKINUM(match_start), substrings); + anchor = match_end; + num_substrings++; + } + + search_base = ((match_end>search_base)?match_end:(search_base+1)); + next_break = lregsearchv(prog, str, cons(MAKINUM(search_base), EOL)); + } + + /* get that tail bit */ + if (anchor < LENGTH(str)) { + substrings = cons2(MAKINUM(anchor), MAKINUM(LENGTH(str)), substrings); + num_substrings++; + } + + num_elements = vector?(2*num_substrings):num_substrings; + ret = make_vector(MAKINUM(num_elements), EOL); + + while (num_substrings--) { + st_start = CAR(substrings); + st_end = CAR(CDR(substrings)); + + if (vector) { + VELTS(ret)[num_substrings*2+0] = st_start; + VELTS(ret)[num_substrings*2+1] = st_end; + } else + VELTS(ret)[num_substrings] = substring(str, st_start, st_end); + + substrings = CDR(CDR(substrings)); + } + + return ret; +} + +SCM lstringsplit(prog, str) + SCM prog, str; +{ + return stringsplitutil(prog, str, SCALAR); +} + +SCM lstringsplitv(prog, str) + SCM prog, str; +{ + return stringsplitutil(prog, str, VECTOR); +} + +typedef struct _item { + struct _item *next; + char *string; + int start; + int end; +} *editItem; + +#define PUSH(list, string_parm, start_parm, end_parm) \ + { \ + editItem item; \ + \ + item = ALLOCA(sizeof(*item)); \ + item->next = list; \ + list = item; \ + item->string = string_parm; \ + item->start = start_parm; \ + item->end = end_parm; \ + } + +/* (string-edit <re> <edit-spec> <string> [<count>]) */ +SCM lstringedit(prog, editspec, args) + SCM prog, editspec, args; +{ + int match_start, match_end, search_base, editcount; + int total_len; + int i, args_len, anchor, maxsubnum; + int backslash; + char *ptr; + editItem editlist, substrings, edit; + SCM str, count, next_edit; + SCM result; + ALLOCA_PROTECT; + + args_len = ilength(args); + + FIXUP_REGEXP(prog); + ASSERT(NIMP(prog) && tc16_rgx==CAR(prog), prog, ARG1, s_stringedit); + ASSERT(NIMP(editspec) && STRINGP(editspec), editspec, ARG2, s_stringedit); + ASSERT((args_len==1)||(args_len==2), args, WNA, s_stringedit); + + str = CAR(args); + ASSERT(NIMP(str)&&STRINGP(str), str, ARG3, s_stringedit); + + if (args_len==2) { + count = CAR(CDR(args)); + ASSERT(INUMP(count)||(count==BOOL_T), count, ARG4, s_stringedit); + } else + count = MAKINUM(1); + + /* process the editspec - break it into a list of dotted pairs + * of integers for substrings to be inserted and + * integers representing matched subexpressions that + * should be inserted. + */ + + maxsubnum = RGX(prog)->re_nsub; + anchor = 0; + backslash = 0; + editlist = NULL; + ptr = CHARS(editspec); + + for (i=0; i<LENGTH(editspec); i++) { + if (backslash && (ptr[i]>='0') && (ptr[i] <='9') && + ((ptr[i]-'0')<=maxsubnum)) + { + if ((i-1)>anchor) + PUSH(editlist, CHARS(editspec), anchor, i-1); + + PUSH(editlist, CHARS(editspec), ptr[i]-'0', -1); + anchor = i+1; + } + backslash = (ptr[i] == '\\')?1:0; + } + + if (anchor < LENGTH(editspec)) + PUSH(editlist, CHARS(editspec), anchor, LENGTH(editspec)); + + /* now, reverse the list of edit items */ + { + editItem prev, cur, next; + + for (prev=NULL, cur=editlist; cur; prev=cur, cur=next) { + next = cur->next; + cur->next = prev; + } + editlist = prev; + } + + anchor = 0; + search_base = 0; + editcount = 0; + substrings = NULL; + + next_edit = lregsearchv(prog, str, cons(MAKINUM(search_base), EOL)); + + while (next_edit != BOOL_F) { + if (INUMP(count) && (editcount==INUM(count))) + break; + + match_start = INUM(VELTS(next_edit)[0]); + match_end = INUM(VELTS(next_edit)[1]); + + if (match_start < match_end) { + PUSH(substrings, CHARS(str), anchor, match_start); + anchor = match_end; + } + + for (edit=editlist; edit; edit=edit->next) { + if (edit->end == -1) { + /* A backslash number in the original editspec */ + PUSH(substrings, CHARS(str), + INUM(VELTS(next_edit)[edit->start*2+0]), + INUM(VELTS(next_edit)[edit->start*2+1])); + } else + /* normal string in the editspec */ + PUSH(substrings, edit->string, edit->start, edit->end); + } + + editcount++; + search_base = ((match_end>search_base)?match_end:(search_base+1)); + next_edit = lregsearchv(prog, str, cons(MAKINUM(search_base), EOL)); + } + + /* get that tail bit */ + if (anchor < LENGTH(str)) + PUSH(substrings, CHARS(str), anchor, LENGTH(str)); + + /* assemble the result string */ + for (edit=substrings, total_len=0; edit; edit=edit->next) + total_len += (edit->end - edit->start); + + result = makstr(total_len); + ptr = CHARS(result) + total_len; /* point at the null at the end */ + + for (edit=substrings; edit; edit=edit->next) { + ptr -= (edit->end - edit->start); + bcopy(edit->string + edit->start, ptr, edit->end - edit->start); + } + return result; +} +#undef PUSH + +void init_rgx() +{ + tc16_rgx = newsmob(&rgxsmob); + make_subr(s_regcomp, tc7_subr_2o, lregcomp); + make_subr(s_regexec, tc7_subr_2, lregexec); + make_subr(s_regmatp, tc7_subr_2, lregmatp); + make_subr(s_regerror, tc7_subr_1, lregerror); + make_subr(s_regsearch, tc7_lsubr_2, lregsearch); + make_subr(s_regsearchv, tc7_lsubr_2, lregsearchv); + make_subr(s_regmatch, tc7_lsubr_2, lregmatch); + make_subr(s_regmatchv, tc7_lsubr_2, lregmatchv); + make_subr(s_stringsplit, tc7_subr_2, lstringsplit); + make_subr(s_stringsplitv, tc7_subr_2, lstringsplitv); + make_subr(s_stringedit, tc7_lsubr_2, lstringedit); + add_feature(s_regex); +} |